Essential basic functionality#

import polars as pl
import numpy as np
from helper.jupyter import row
index = pl.date_range(pl.date(2000, 1, 1), pl.date(2000, 1, 8), eager=True).alias('index')
s = pl.DataFrame(dict(
    index=['a', 'b', 'c', 'd', 'e'],
    value=np.random.randn(5)
))

df = pl.DataFrame(np.random.randn(8, 3), schema=['A', 'B', 'C']).insert_column(0, index)

data = '''
       a         b         c
-0.173215  0.119209 -1.044236
-0.861849 -2.104569 -0.494929
+1.071804  0.721555 -0.706771
-1.039575  0.271860 -0.424972
+0.567020  0.276232 -1.087401
-0.673690  0.113648 -1.478427
+0.524988  0.404705  0.577046
-1.715002 -1.039268 -0.370647
'''

from helper.polars import to_dataframe
df = to_dataframe(data).insert_column(0, index)
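to_dataframe comes from a small local helper module whose source is not shown here. A minimal sketch of what such a helper might look like (hypothetical implementation, assuming it simply parses the whitespace-separated text through pandas and converts the result):

# hypothetical sketch of helper.polars.to_dataframe, not the actual source
import io
import pandas as pd

def to_dataframe(text: str) -> pl.DataFrame:
    # sep=r"\s+" splits on runs of whitespace; the first row supplies column names
    return pl.from_pandas(pd.read_csv(io.StringIO(text.strip()), sep=r"\s+"))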

Head and tail#

long_series = pl.Series(np.random.randn(1000))
long_series.head(5)
shape: (5,)
f64
0.033182
-0.877532
0.696501
0.685849
0.271837
long_series.tail(3)
shape: (3,)
f64
-0.510464
-0.84928
-0.308495

Attributes and underlying data#

df[:2]
shape: (2, 4)
index         a           b           c
date          f64         f64         f64
2000-01-01    -0.173215    0.119209   -1.044236
2000-01-02    -0.861849   -2.104569   -0.494929
df = df.rename(str.lower)
df
shape: (8, 4)
index         a           b           c
date          f64         f64         f64
2000-01-01    -0.173215    0.119209   -1.044236
2000-01-02    -0.861849   -2.104569   -0.494929
2000-01-03     1.071804    0.721555   -0.706771
2000-01-04    -1.039575    0.27186    -0.424972
2000-01-05     0.56702     0.276232   -1.087401
2000-01-06    -0.67369     0.113648   -1.478427
2000-01-07     0.524988    0.404705    0.577046
2000-01-08    -1.715002   -1.039268   -0.370647
s['value'].to_arrow()
<pyarrow.lib.DoubleArray object at 0x0000018C7926DC00>
[
  -1.0640128516665701,
  -0.8614483783464599,
  0.05714225040961911,
  0.9208589421796419,
  0.44817508840960785
]
s['index'].to_arrow()
<pyarrow.lib.LargeStringArray object at 0x0000018C7926DF00>
[
  "a",
  "b",
  "c",
  "d",
  "e"
]
s['value'].to_numpy()
array([-1.06401285, -0.86144838,  0.05714225,  0.92085894,  0.44817509])
np.asarray(s['value'])
array([-1.06401285, -0.86144838,  0.05714225,  0.92085894,  0.44817509])
ser = (
    pl.date_range(pl.date(2000, 1, 1), pl.date(2000, 1, 2), eager=True)
    .cast(pl.Datetime)
    .dt.replace_time_zone('CET')
)
ser.to_numpy()
array(['1999-12-31T23:00:00.000000', '2000-01-01T23:00:00.000000'],
      dtype='datetime64[us]')
# df.to_numpy()
df.select(pl.exclude('index')).to_numpy()
array([[-0.173215,  0.119209, -1.044236],
       [-0.861849, -2.104569, -0.494929],
       [ 1.071804,  0.721555, -0.706771],
       [-1.039575,  0.27186 , -0.424972],
       [ 0.56702 ,  0.276232, -1.087401],
       [-0.67369 ,  0.113648, -1.478427],
       [ 0.524988,  0.404705,  0.577046],
       [-1.715002, -1.039268, -0.370647]])
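The full-frame df.to_numpy() call is left commented out because the date column would force an object-dtype array; excluding it yields a plain float array.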

Accelerated operations#

Flexible binary operations#

data = '''
index   one       two     three
a  1.394981  1.772517       NaN
b  0.343054  1.912123 -0.050390
c  0.695246  1.478369  1.227435
d       NaN  0.279344 -0.613172
'''
df = to_dataframe(data)
df
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    NaN
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
row_data = df.row(1, named=True)
column = df.select('index', value='two')
print(row_data)
print(column)
{'index': 'b', 'one': 0.343054, 'two': 1.912123, 'three': -0.05039}
shape: (4, 2)
┌───────┬──────────┐
│ index ┆ value    │
│ ---   ┆ ---      │
│ str   ┆ f64      │
╞═══════╪══════════╡
│ a     ┆ 1.772517 │
│ b     ┆ 1.912123 │
│ c     ┆ 1.478369 │
│ d     ┆ 0.279344 │
└───────┴──────────┘
# df.sub(row, axis="columns")
df.select(
    'index',
    *[pl.col(name) - val for name, val in row_data.items() if name != "index"]
)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.051927   -0.139606    NaN
"b"       0.0         0.0         0.0
"c"       0.352192   -0.433754    1.277825
"d"      NaN         -1.632779   -0.562782
# df.sub(column, axis="index")
df.select(
    'index',
    pl.exclude('index') - column['value']
)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"      -0.377536    0.0         NaN
"b"      -1.569069    0.0        -1.962513
"c"      -0.783123    0.0        -0.250934
"d"      NaN          0.0        -0.892516
mi = pl.DataFrame(
    [(1, "a"), (1, "b"), (1, "c"), (2, "a")], 
    schema=["first", "second"],
    orient="row"
)
dfmi = pl.concat([mi, df.select(pl.exclude('index'))], how='horizontal')
dfmi
shape: (4, 5)
first    second    one         two         three
i64      str       f64         f64         f64
1        "a"        1.394981    1.772517    NaN
1        "b"        0.343054    1.912123   -0.05039
1        "c"        0.695246    1.478369    1.227435
2        "a"       NaN          0.279344   -0.613172
# dfmi.sub(column, axis=0, level="second")
new_column = column.join(dfmi.select('second'), left_on='index', right_on='second')
dfmi.select(
    pl.col('first', 'second'),
    pl.exclude('first', 'second') - new_column['value']
)
shape: (4, 5)
first    second    one         two         three
i64      str       f64         f64         f64
1        "a"       -0.377536    0.0         NaN
1        "b"       -1.569069    0.0        -1.962513
1        "c"       -0.783123    0.0        -0.250934
2        "a"       NaN         -1.493173   -2.385689
s = pl.Series(np.arange(10))
div, rem = s // 3, s % 3
row(s, div, rem)
shape: (10,)
i32
0
1
2
3
4
5
6
7
8
9
shape: (10,)
i32
0
0
0
1
1
1
2
2
2
3
shape: (10,)
i32
0
1
2
0
1
2
0
1
2
0

Missing data / operations with fill values#

# df2.loc["a", "three"] = 1.0
df2 = df.with_columns(
    pl.when(pl.col('index') == 'a')
      .then(1.0)
      .otherwise('three')
      .name.keep()
)
row(df, df2)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    NaN
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    1.0
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
# df2.loc["a", "three"] = 1.0
df2 = df.update(pl.select(index=pl.lit('a'), three=1), on='index')
# df + df2
from helper.polars import align_op
align_op(df, df2, on='index', op=pl.Expr.add)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       2.789962    3.545034    NaN
"b"       0.686108    3.824246   -0.10078
"c"       1.390492    2.956738    2.45487
"d"      NaN          0.558688   -1.226344

Flexible comparisons#

# df.gt(df2)
align_op(df, df2, on='index', op=pl.Expr.gt)
shape: (4, 4)
index    one      two      three
str      bool     bool     bool
"a"      false    false    true
"b"      false    false    false
"c"      false    false    false
"d"      false    false    false
# df2.ne(df)
align_op(df2, df, on='index', op=pl.Expr.ne)
shape: (4, 4)
index    one      two      three
str      bool     bool     bool
"a"      false    false    true
"b"      false    false    false
"c"      false    false    false
"d"      false    false    false

Boolean reductions#

# (df > 0).all()
df.select((pl.exclude('index') > 0).all())
shape: (1, 3)
one     two     three
bool    bool    bool
true    true    false
# (df > 0).any()
df.select((pl.exclude('index') > 0).any())
shape: (1, 3)
one     two     three
bool    bool    bool
true    true    true
# (df > 0).any().any()
df.select(pl.any_horizontal(pl.exclude('index') > 0).any()).item()
True
df.is_empty()
False
pl.DataFrame(schema=list('ABC')).is_empty()
True

Comparing if objects are equivalent#

# df + df == df * 2
align_op(
    align_op(df, df, op=pl.Expr.add),
    df.select(
        'index',
        pl.exclude('index') * 2
    ),
    op=pl.Expr.eq
)
shape: (4, 4)
index    one     two     three
str      bool    bool    bool
"a"      true    true    true
"b"      true    true    true
"c"      true    true    true
"d"      true    true    true

In polars, unlike standard float comparison semantics, NaN compares equal to NaN:

pl.Series([np.nan]) == pl.Series([np.nan])
shape: (1,)
bool
true
# (df + df).equals(df * 2)
align_op(df, df, op=pl.Expr.add).equals(
    df.select(
        'index',
        pl.exclude('index') * 2
    ),
)
True
df1 = pl.DataFrame({"index":[0, 1, 2], "col": [1.0, 0, np.nan]})
df2 = pl.DataFrame({"index":[2, 1, 0], "col": [np.nan, 0, 1.0]})
df1.equals(df2)
False
df1.equals(df2.sort('index'))
True

Comparing array-like objects#

pl.Series(["foo", "bar", "baz"]) == "foo"
shape: (3,)
bool
true
false
false
row(
    pl.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"]),
    pl.Series(["foo", "bar", "baz"]) == pl.Series(np.array(["foo", "bar", "qux"]))
)
shape: (3,)
bool
false
false
false
shape: (3,)
bool
true
true
false

Combining overlapping data sets#

df1 = pl.DataFrame({
    "A": [1.0, np.nan, 3.0, 5.0, np.nan], 
    "B": [np.nan, 2.0, 3.0, np.nan, 6.0]
})

df2 = pl.DataFrame({
    "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
    "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
})

row(df1, df2)
shape: (5, 2)
A       B
f64     f64
1.0     NaN
NaN     2.0
3.0     3.0
5.0     NaN
NaN     6.0
shape: (6, 2)
A       B
f64     f64
5.0     NaN
2.0     NaN
4.0     3.0
NaN     4.0
3.0     6.0
7.0     8.0
# df1.combine_first(df2)
(
    df1
    .with_row_index()
    .join(df2.with_row_index(), on='index', how='left')
    .fill_nan(None)
    .select(
        [pl.coalesce(pl.col(c), pl.col(f'{c}_right')) for c in df1.columns]
    )
)
shape: (5, 2)
A       B
f64     f64
1.0     null
2.0     2.0
3.0     3.0
5.0     4.0
3.0     6.0
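fill_nan(None) turns the NaN markers into proper nulls first, since pl.coalesce only skips nulls; coalesce then takes the first non-null value per row, giving df1 priority exactly as combine_first does.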

General DataFrame combine#

# df1.combine(df2, combiner)

def combiner(x, y):
    return pl.when(x.fill_nan(None).is_null()).then(y).otherwise(x)
     
align_op(df1.with_row_index(), df2.with_row_index(), combiner, how='full', fill_value=None)
shape: (6, 3)
index    A       B
u32      f64     f64
0        1.0     NaN
1        2.0     2.0
2        3.0     3.0
3        5.0     4.0
4        3.0     6.0
5        7.0     8.0

Descriptive statistics#

df
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    NaN
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
# df.mean(0)
df.select(pl.exclude('index').fill_nan(None).mean())
shape: (1, 3)
one         two         three
f64         f64         f64
0.811094    1.360588    0.187958
# df.mean(1)
df.select(
    'index',
    pl.mean_horizontal(pl.exclude('index').fill_nan(None))
)
shape: (4, 2)
index    one
str      f64
"a"       1.583749
"b"       0.734929
"c"       1.133683
"d"      -0.166914
# df.sum(0, skipna=False)
df.select(pl.exclude('index').sum())
shape: (1, 3)
one    two         three
f64    f64         f64
NaN    5.442353    NaN
# df.sum(axis=1, skipna=True)
df.select(
    'index',
    pl.sum_horizontal(pl.exclude('index').fill_nan(None))
)
shape: (4, 2)
index    one
str      f64
"a"       3.167498
"b"       2.204787
"c"       3.40105
"d"      -0.333828
cols = pl.exclude('index').fill_nan(None)
ts_stand = df.select(
    (cols - cols.mean()) / cols.std()
)
ts_stand.std()
shape: (1, 3)
one    two    three
f64    f64    f64
1.0    1.0    1.0
# xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)
cols = pl.exclude('index').fill_nan(None)
cols_list = pl.concat_list(cols).list
xs_stand = df.select(
    'index',
    (cols - cols_list.mean()) / cols_list.std()
)
xs_stand.select(
    'index',
    cols_list.std()
)
shape: (4, 2)
index    one
str      f64
"a"      1.0
"b"      1.0
"c"      1.0
"d"      1.0
# df.cumsum()
cols = pl.exclude('index').fill_nan(None)
df.select(
    'index',
    cols.cum_sum()
)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    null
"b"       1.738035    3.68464    -0.05039
"c"       2.433281    5.163009    1.177045
"d"      null         5.442353    0.563873
np.mean(df['one'].to_numpy())
nan
series = pl.Series(np.random.randn(500))
series[np.arange(20, 500)] = np.nan
series[np.arange(10, 20)] = 5
series.n_unique()
12
series.drop_nans().n_unique()
11
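The two counts differ because polars keeps NaN (a float value) and null (missing data) apart, and drop_nans only removes the former. A quick illustration (the series name here is just for this example):

s_mixed = pl.Series([1.0, float('nan'), None])
s_mixed.drop_nans()    # drops the NaN, keeps the null
s_mixed.drop_nulls()   # drops the null, keeps the NaN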

Summarizing data: describe#

series = pl.Series(np.random.randn(1000))
series[np.arange(0, len(series), 2)] = None
series.describe()
shape: (9, 2)
statistic       value
str             f64
"count"          500.0
"null_count"     500.0
"mean"           0.031273
"std"            1.005434
"min"           -3.13751
"25%"           -0.626407
"50%"            0.057798
"75%"            0.70786
"max"            2.828656
frame = pl.DataFrame(np.random.randn(1000, 5), schema=["a", "b", "c", "d", "e"])
# frame.iloc[::2] = np.nan

frame = frame.select(
    pl.when(pl.int_range(0, pl.len()) % 2 == 0)
    .then(None)
    .otherwise(pl.all())
    .name.keep()
)
frame.describe()
shape: (9, 6)
statistic       a            b            c            d            e
str             f64          f64          f64          f64          f64
"count"         500.0        500.0        500.0        500.0        500.0
"null_count"    500.0        500.0        500.0        500.0        500.0
"mean"          -0.009165     0.062481     0.038552    -0.028247    -0.049076
"std"            0.964467     1.007907     1.040596     0.981704     1.02394
"min"           -3.296775    -3.199059    -3.118371    -2.742383    -2.794847
"25%"           -0.672506    -0.58794     -0.618475    -0.714627    -0.755011
"50%"           -0.046801     0.078398    -0.028415    -0.062934    -0.046535
"75%"            0.645693     0.704563     0.756428     0.627254     0.649599
"max"            2.668789     2.729561     2.750978     3.768601     3.231683
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])
shape: (10, 2)
statistic       value
str             f64
"count"          500.0
"null_count"     500.0
"mean"           0.031273
"std"            1.005434
"min"           -3.13751
"5%"            -1.617022
"25%"           -0.626407
"75%"            0.70786
"95%"            1.621126
"max"            2.828656
s = pl.Series(["a", "a", "b", "b", "a", "a", None, "c", "d", "a"])
s.describe()
shape: (4, 2)
statistic       value
str             str
"count"         "9"
"null_count"    "1"
"min"           "a"
"max"           "d"
frame = pl.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)})
frame.describe()
shape: (9, 3)
statistic       a        b
str             str      f64
"count"         "4"      4.0
"null_count"    "0"      0.0
"mean"          null     1.5
"std"           null     1.290994
"min"           "No"     0.0
"25%"           null     1.0
"50%"           null     2.0
"75%"           null     2.0
"max"           "Yes"    3.0

Index of min/max values#

s1 = pl.Series(np.random.randn(5))
s1
shape: (5,)
f64
0.642147
1.132986
0.536466
0.2154
0.539621
# s1.idxmin(), s1.idxmax()
s1.arg_min(), s1.arg_max()
(3, 1)
df1 = pl.DataFrame(np.random.randn(5, 3), schema=["A", "B", "C"])
df1
shape: (5, 3)
A            B            C
f64          f64          f64
-0.331573     0.554591    -0.581225
-0.154205     1.388221    -0.085109
 0.993925    -1.25826     -1.204862
-0.268844    -0.079973    -1.734103
 2.140542     0.792179    -1.549576
# df1.idxmin(axis=0)
df1.select(pl.all().arg_min())
shape: (1, 3)
A      B      C
u32    u32    u32
0      2      3
# df1.idxmax(axis=1)
df1.select(
    arg_max=pl.lit(pl.Series(df1.columns)).get(
        pl.concat_list(pl.all()).list.arg_max()
    )
)
shape: (5, 1)
arg_max
str
"B"
"B"
"A"
"B"
"A"
df3 = pl.DataFrame(
    [['e', 'd', 'c', 'b', 'a'],
    [2, 1, 1, 3, None]], schema=["index", "A"]
)
df3
shape: (5, 2)
index    A
str      i64
"e"      2
"d"      1
"c"      1
"b"      3
"a"      null
# df3["A"].idxmin()
df3.select(
    pl.col('index').get(pl.col('A').arg_max())
).item()
'b'

Value counts (histogramming) / mode#

data = np.array(
    [6, 6, 2, 3, 5, 3, 2, 5, 4, 5, 4, 3, 
     4, 5, 0, 2, 0, 4, 2, 0, 3, 2, 2, 5, 
     6, 5, 3, 4, 6, 4, 3, 5, 6, 4, 3, 6, 
     2, 6, 6, 2, 3, 4, 2, 1, 6, 2, 6, 1, 5, 4])
s = pl.Series('value', data)
s.value_counts()
shape: (7, 2)
value    count
i32      u32
5        8
2        10
6        10
4        9
0        3
1        2
3        8
# frame.value_counts()
data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
frame = pl.DataFrame(data)
frame.select(
    pl.struct(pl.all()).value_counts().struct.unnest()
)
shape: (4, 2)
a            count
struct[2]    u32
{1,"x"}      1
{3,"y"}      1
{4,"y"}      1
{2,"x"}      1
s5 = pl.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])
s5.mode()
shape: (2,)
i64
7
3
# df5.mode()
df5 = pl.DataFrame(
    {
        "A": np.random.randint(0, 7, size=50),
        "B": np.random.randint(-10, 15, size=50),
    }
)
df5.select(pl.all().mode().implode())
shape: (1, 2)
A            B
list[i32]    list[i32]
[4]          [13, -2]
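Because mode() can return several values per column (B has two modes here), implode() wraps each result in a list so the frame still reduces to a single row.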

Discretization and quantiling#