Essential basic functionality#

import polars as pl
import numpy as np
from helper.jupyter import row
index = pl.date_range(pl.date(2000, 1, 1), pl.date(2000, 1, 8), eager=True).alias('index')
s = pl.DataFrame(dict(
    index=['a', 'b', 'c', 'd', 'e'],
    value=np.random.randn(5)
))

df = pl.DataFrame(np.random.randn(8, 3), schema=['A', 'B', 'C']).insert_column(0, index)

data = '''
       a         b         c
-0.173215  0.119209 -1.044236
-0.861849 -2.104569 -0.494929
+1.071804  0.721555 -0.706771
-1.039575  0.271860 -0.424972
+0.567020  0.276232 -1.087401
-0.673690  0.113648 -1.478427
+0.524988  0.404705  0.577046
-1.715002 -1.039268 -0.370647
'''

from helper.polars import to_dataframe
df = to_dataframe(data).insert_column(0, index)
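to_dataframe comes from a small local helper module whose source is not shown here. A minimal sketch of what such a helper might look like (hypothetical implementation, assuming it simply parses the whitespace-separated text through pandas and converts the result):

# hypothetical sketch of helper.polars.to_dataframe, not the actual source
import io
import pandas as pd

def to_dataframe(text: str) -> pl.DataFrame:
    # sep=r"\s+" splits on runs of whitespace; the first row supplies column names
    return pl.from_pandas(pd.read_csv(io.StringIO(text.strip()), sep=r"\s+"))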

Head and tail#

long_series = pl.Series(np.random.randn(1000))
long_series.head(5)
shape: (5,)
f64
0.033182
-0.877532
0.696501
0.685849
0.271837
long_series.tail(3)
shape: (3,)
f64
-0.510464
-0.84928
-0.308495

Attributes and underlying data#

df[:2]
shape: (2, 4)
index         a           b           c
date          f64         f64         f64
2000-01-01    -0.173215    0.119209   -1.044236
2000-01-02    -0.861849   -2.104569   -0.494929
df = df.rename(str.lower)
df
shape: (8, 4)
index         a           b           c
date          f64         f64         f64
2000-01-01    -0.173215    0.119209   -1.044236
2000-01-02    -0.861849   -2.104569   -0.494929
2000-01-03     1.071804    0.721555   -0.706771
2000-01-04    -1.039575    0.27186    -0.424972
2000-01-05     0.56702     0.276232   -1.087401
2000-01-06    -0.67369     0.113648   -1.478427
2000-01-07     0.524988    0.404705    0.577046
2000-01-08    -1.715002   -1.039268   -0.370647
s['value'].to_arrow()
<pyarrow.lib.DoubleArray object at 0x0000018C7926DC00>
[
  -1.0640128516665701,
  -0.8614483783464599,
  0.05714225040961911,
  0.9208589421796419,
  0.44817508840960785
]
s['index'].to_arrow()
<pyarrow.lib.LargeStringArray object at 0x0000018C7926DF00>
[
  "a",
  "b",
  "c",
  "d",
  "e"
]
s['value'].to_numpy()
array([-1.06401285, -0.86144838,  0.05714225,  0.92085894,  0.44817509])
np.asarray(s['value'])
array([-1.06401285, -0.86144838,  0.05714225,  0.92085894,  0.44817509])
ser = (
    pl.date_range(pl.date(2000, 1, 1), pl.date(2000, 1, 2), eager=True)
    .cast(pl.Datetime)
    .dt.replace_time_zone('CET')
)
ser.to_numpy()
array(['1999-12-31T23:00:00.000000', '2000-01-01T23:00:00.000000'],
      dtype='datetime64[us]')
# df.to_numpy()
df.select(pl.exclude('index')).to_numpy()
array([[-0.173215,  0.119209, -1.044236],
       [-0.861849, -2.104569, -0.494929],
       [ 1.071804,  0.721555, -0.706771],
       [-1.039575,  0.27186 , -0.424972],
       [ 0.56702 ,  0.276232, -1.087401],
       [-0.67369 ,  0.113648, -1.478427],
       [ 0.524988,  0.404705,  0.577046],
       [-1.715002, -1.039268, -0.370647]])
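The full-frame df.to_numpy() call is left commented out because the date column would force an object-dtype array; excluding it yields a plain float array.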

Accelerated operations#

Flexible binary operations#

data = '''
index   one       two     three
a  1.394981  1.772517       NaN
b  0.343054  1.912123 -0.050390
c  0.695246  1.478369  1.227435
d       NaN  0.279344 -0.613172
'''
df = to_dataframe(data)
df
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    NaN
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
row_data = df.row(1, named=True)
column = df.select('index', value='two')
print(row_data)
print(column)
{'index': 'b', 'one': 0.343054, 'two': 1.912123, 'three': -0.05039}
shape: (4, 2)
┌───────┬──────────┐
│ index ┆ value    │
│ ---   ┆ ---      │
│ str   ┆ f64      │
╞═══════╪══════════╡
│ a     ┆ 1.772517 │
│ b     ┆ 1.912123 │
│ c     ┆ 1.478369 │
│ d     ┆ 0.279344 │
└───────┴──────────┘
# df.sub(row, axis="columns")
df.select(
    'index',
    *[pl.col(name) - val for name, val in row_data.items() if name != "index"]
)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.051927   -0.139606    NaN
"b"       0.0         0.0         0.0
"c"       0.352192   -0.433754    1.277825
"d"      NaN         -1.632779   -0.562782
# df.sub(column, axis="index")
df.select(
    'index',
    pl.exclude('index') - column['value']
)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"      -0.377536    0.0         NaN
"b"      -1.569069    0.0        -1.962513
"c"      -0.783123    0.0        -0.250934
"d"      NaN          0.0        -0.892516
mi = pl.DataFrame(
    [(1, "a"), (1, "b"), (1, "c"), (2, "a")], 
    schema=["first", "second"],
    orient="row"
)
dfmi = pl.concat([mi, df.select(pl.exclude('index'))], how='horizontal')
dfmi
shape: (4, 5)
first    second    one         two         three
i64      str       f64         f64         f64
1        "a"        1.394981    1.772517    NaN
1        "b"        0.343054    1.912123   -0.05039
1        "c"        0.695246    1.478369    1.227435
2        "a"       NaN          0.279344   -0.613172
# dfmi.sub(column, axis=0, level="second")
new_column = column.join(dfmi.select('second'), left_on='index', right_on='second')
dfmi.select(
    pl.col('first', 'second'),
    pl.exclude('first', 'second') - new_column['value']
)
shape: (4, 5)
first    second    one         two         three
i64      str       f64         f64         f64
1        "a"       -0.377536    0.0         NaN
1        "b"       -1.569069    0.0        -1.962513
1        "c"       -0.783123    0.0        -0.250934
2        "a"       NaN         -1.493173   -2.385689
s = pl.Series(np.arange(10))
div, rem = s // 3, s % 3
row(s, div, rem)
shape: (10,)
i32
0
1
2
3
4
5
6
7
8
9
shape: (10,)
i32
0
0
0
1
1
1
2
2
2
3
shape: (10,)
i32
0
1
2
0
1
2
0
1
2
0

Missing data / operations with fill values#

# df2.loc["a", "three"] = 1.0
df2 = df.with_columns(
    pl.when(pl.col('index') == 'a')
      .then(1.0)
      .otherwise('three')
      .name.keep()
)
row(df, df2)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    NaN
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    1.0
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
# df2.loc["a", "three"] = 1.0
df2 = df.update(pl.select(index=pl.lit('a'), three=1), on='index')
# df + df2
from helper.polars import align_op
align_op(df, df2, on='index', op=pl.Expr.add)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       2.789962    3.545034    NaN
"b"       0.686108    3.824246   -0.10078
"c"       1.390492    2.956738    2.45487
"d"      NaN          0.558688   -1.226344

Flexible comparisons#

# df.gt(df2)
align_op(df, df2, on='index', op=pl.Expr.gt)
shape: (4, 4)
index    one      two      three
str      bool     bool     bool
"a"      false    false    true
"b"      false    false    false
"c"      false    false    false
"d"      false    false    false
# df2.ne(df)
align_op(df2, df, on='index', op=pl.Expr.ne)
shape: (4, 4)
index    one      two      three
str      bool     bool     bool
"a"      false    false    true
"b"      false    false    false
"c"      false    false    false
"d"      false    false    false

Boolean reductions#

# (df > 0).all()
df.select((pl.exclude('index') > 0).all())
shape: (1, 3)
one     two     three
bool    bool    bool
true    true    false
# (df > 0).any()
df.select((pl.exclude('index') > 0).any())
shape: (1, 3)
one     two     three
bool    bool    bool
true    true    true
# (df > 0).any().any()
df.select(pl.any_horizontal(pl.exclude('index') > 0).any()).item()
True
df.is_empty()
False
pl.DataFrame(schema=list('ABC')).is_empty()
True

Comparing if objects are equivalent#

# df + df == df * 2
align_op(
    align_op(df, df, op=pl.Expr.add),
    df.select(
        'index',
        pl.exclude('index') * 2
    ),
    op=pl.Expr.eq
)
shape: (4, 4)
index    one     two     three
str      bool    bool    bool
"a"      true    true    true
"b"      true    true    true
"c"      true    true    true
"d"      true    true    true

In polars, unlike standard float comparison semantics, NaN compares equal to NaN:

pl.Series([np.nan]) == pl.Series([np.nan])
shape: (1,)
bool
true
# (df + df).equals(df * 2)
align_op(df, df, op=pl.Expr.add).equals(
    df.select(
        'index',
        pl.exclude('index') * 2
    ),
)
True
df1 = pl.DataFrame({"index":[0, 1, 2], "col": [1.0, 0, np.nan]})
df2 = pl.DataFrame({"index":[2, 1, 0], "col": [np.nan, 0, 1.0]})
df1.equals(df2)
False
df1.equals(df2.sort('index'))
True

Comparing array-like objects#

pl.Series(["foo", "bar", "baz"]) == "foo"
shape: (3,)
bool
true
false
false
row(
    pl.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"]),
    pl.Series(["foo", "bar", "baz"]) == pl.Series(np.array(["foo", "bar", "qux"]))
)
shape: (3,)
bool
false
false
false
shape: (3,)
bool
true
true
false

Combining overlapping data sets#

df1 = pl.DataFrame({
    "A": [1.0, np.nan, 3.0, 5.0, np.nan], 
    "B": [np.nan, 2.0, 3.0, np.nan, 6.0]
})

df2 = pl.DataFrame({
    "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
    "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
})

row(df1, df2)
shape: (5, 2)
A       B
f64     f64
1.0     NaN
NaN     2.0
3.0     3.0
5.0     NaN
NaN     6.0
shape: (6, 2)
A       B
f64     f64
5.0     NaN
2.0     NaN
4.0     3.0
NaN     4.0
3.0     6.0
7.0     8.0
# df1.combine_first(df2)
(
    df1
    .with_row_index()
    .join(df2.with_row_index(), on='index', how='left')
    .fill_nan(None)
    .select(
        [pl.coalesce(pl.col(c), pl.col(f'{c}_right')) for c in df1.columns]
    )
)
shape: (5, 2)
A       B
f64     f64
1.0     null
2.0     2.0
3.0     3.0
5.0     4.0
3.0     6.0
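fill_nan(None) turns the NaN markers into proper nulls first, since pl.coalesce only skips nulls; coalesce then takes the first non-null value per row, giving df1 priority exactly as combine_first does.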

General DataFrame combine#

# df1.combine(df2, combiner)

def combiner(x, y):
    return pl.when(x.fill_nan(None).is_null()).then(y).otherwise(x)
     
align_op(df1.with_row_index(), df2.with_row_index(), combiner, how='full', fill_value=None)
shape: (6, 3)
index    A       B
u32      f64     f64
0        1.0     NaN
1        2.0     2.0
2        3.0     3.0
3        5.0     4.0
4        3.0     6.0
5        7.0     8.0

Descriptive statistics#

df
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    NaN
"b"       0.343054    1.912123   -0.05039
"c"       0.695246    1.478369    1.227435
"d"      NaN          0.279344   -0.613172
# df.mean(0)
df.select(pl.exclude('index').fill_nan(None).mean())
shape: (1, 3)
one         two         three
f64         f64         f64
0.811094    1.360588    0.187958
# df.mean(1)
df.select(
    'index',
    pl.mean_horizontal(pl.exclude('index').fill_nan(None))
)
shape: (4, 2)
index    one
str      f64
"a"       1.583749
"b"       0.734929
"c"       1.133683
"d"      -0.166914
# df.sum(0, skipna=False)
df.select(pl.exclude('index').sum())
shape: (1, 3)
one    two         three
f64    f64         f64
NaN    5.442353    NaN
# df.sum(axis=1, skipna=True)
df.select(
    'index',
    pl.sum_horizontal(pl.exclude('index').fill_nan(None))
)
shape: (4, 2)
index    one
str      f64
"a"       3.167498
"b"       2.204787
"c"       3.40105
"d"      -0.333828
cols = pl.exclude('index').fill_nan(None)
ts_stand = df.select(
    (cols - cols.mean()) / cols.std()
)
ts_stand.std()
shape: (1, 3)
one    two    three
f64    f64    f64
1.0    1.0    1.0
# xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)
cols = pl.exclude('index').fill_nan(None)
cols_list = pl.concat_list(cols).list
xs_stand = df.select(
    'index',
    (cols - cols_list.mean()) / cols_list.std()
)
xs_stand.select(
    'index',
    cols_list.std()
)
shape: (4, 2)
index    one
str      f64
"a"      1.0
"b"      1.0
"c"      1.0
"d"      1.0
# df.cumsum()
cols = pl.exclude('index').fill_nan(None)
df.select(
    'index',
    cols.cum_sum()
)
shape: (4, 4)
index    one         two         three
str      f64         f64         f64
"a"       1.394981    1.772517    null
"b"       1.738035    3.68464    -0.05039
"c"       2.433281    5.163009    1.177045
"d"      null         5.442353    0.563873
np.mean(df['one'].to_numpy())
nan
series = pl.Series(np.random.randn(500))
series[np.arange(20, 500)] = np.nan
series[np.arange(10, 20)] = 5
series.n_unique()
12
series.drop_nans().n_unique()
11
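The two counts differ because polars keeps NaN (a float value) and null (missing data) apart, and drop_nans only removes the former. A quick illustration (the series name here is just for this example):

s_mixed = pl.Series([1.0, float('nan'), None])
s_mixed.drop_nans()    # drops the NaN, keeps the null
s_mixed.drop_nulls()   # drops the null, keeps the NaN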

Summarizing data: describe#

series = pl.Series(np.random.randn(1000))
series[np.arange(0, len(series), 2)] = None
series.describe()
shape: (9, 2)
statistic       value
str             f64
"count"          500.0
"null_count"     500.0
"mean"           0.031273
"std"            1.005434
"min"           -3.13751
"25%"           -0.626407
"50%"            0.057798
"75%"            0.70786
"max"            2.828656
frame = pl.DataFrame(np.random.randn(1000, 5), schema=["a", "b", "c", "d", "e"])
# frame.iloc[::2] = np.nan

frame = frame.select(
    pl.when(pl.int_range(0, pl.len()) % 2 == 0)
    .then(None)
    .otherwise(pl.all())
    .name.keep()
)
frame.describe()
shape: (9, 6)
statistic       a            b            c            d            e
str             f64          f64          f64          f64          f64
"count"         500.0        500.0        500.0        500.0        500.0
"null_count"    500.0        500.0        500.0        500.0        500.0
"mean"          -0.009165     0.062481     0.038552    -0.028247    -0.049076
"std"            0.964467     1.007907     1.040596     0.981704     1.02394
"min"           -3.296775    -3.199059    -3.118371    -2.742383    -2.794847
"25%"           -0.672506    -0.58794     -0.618475    -0.714627    -0.755011
"50%"           -0.046801     0.078398    -0.028415    -0.062934    -0.046535
"75%"            0.645693     0.704563     0.756428     0.627254     0.649599
"max"            2.668789     2.729561     2.750978     3.768601     3.231683
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])
shape: (10, 2)
statistic       value
str             f64
"count"          500.0
"null_count"     500.0
"mean"           0.031273
"std"            1.005434
"min"           -3.13751
"5%"            -1.617022
"25%"           -0.626407
"75%"            0.70786
"95%"            1.621126
"max"            2.828656
s = pl.Series(["a", "a", "b", "b", "a", "a", None, "c", "d", "a"])
s.describe()
shape: (4, 2)
statistic       value
str             str
"count"         "9"
"null_count"    "1"
"min"           "a"
"max"           "d"
frame = pl.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)})
frame.describe()
shape: (9, 3)
statistic       a        b
str             str      f64
"count"         "4"      4.0
"null_count"    "0"      0.0
"mean"          null     1.5
"std"           null     1.290994
"min"           "No"     0.0
"25%"           null     1.0
"50%"           null     2.0
"75%"           null     2.0
"max"           "Yes"    3.0

Index of min/max values#

s1 = pl.Series(np.random.randn(5))
s1
shape: (5,)
f64
0.642147
1.132986
0.536466
0.2154
0.539621
# s1.idxmin(), s1.idxmax()
s1.arg_min(), s1.arg_max()
(3, 1)
df1 = pl.DataFrame(np.random.randn(5, 3), schema=["A", "B", "C"])
df1
shape: (5, 3)
A            B            C
f64          f64          f64
-0.331573     0.554591    -0.581225
-0.154205     1.388221    -0.085109
 0.993925    -1.25826     -1.204862
-0.268844    -0.079973    -1.734103
 2.140542     0.792179    -1.549576
# df1.idxmin(axis=0)
df1.select(pl.all().arg_min())
shape: (1, 3)
A      B      C
u32    u32    u32
0      2      3
# df1.idxmax(axis=1)
df1.select(
    arg_max=pl.lit(pl.Series(df1.columns)).get(
        pl.concat_list(pl.all()).list.arg_max()
    )
)
shape: (5, 1)
arg_max
str
"B"
"B"
"A"
"B"
"A"
df3 = pl.DataFrame(
    [['e', 'd', 'c', 'b', 'a'],
    [2, 1, 1, 3, None]], schema=["index", "A"]
)
df3
shape: (5, 2)
index    A
str      i64
"e"      2
"d"      1
"c"      1
"b"      3
"a"      null
# df3["A"].idxmin()
df3.select(
    pl.col('index').get(pl.col('A').arg_max())
).item()
'b'

Value counts (histogramming) / mode#

data = np.array(
    [6, 6, 2, 3, 5, 3, 2, 5, 4, 5, 4, 3, 
     4, 5, 0, 2, 0, 4, 2, 0, 3, 2, 2, 5, 
     6, 5, 3, 4, 6, 4, 3, 5, 6, 4, 3, 6, 
     2, 6, 6, 2, 3, 4, 2, 1, 6, 2, 6, 1, 5, 4])
s = pl.Series('value', data)
s.value_counts()
shape: (7, 2)
value    count
i32      u32
5        8
2        10
6        10
4        9
0        3
1        2
3        8
# frame.value_counts()
data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
frame = pl.DataFrame(data)
frame.select(
    pl.struct(pl.all()).value_counts().struct.unnest()
)
shape: (4, 2)
a            count
struct[2]    u32
{1,"x"}      1
{3,"y"}      1
{4,"y"}      1
{2,"x"}      1
s5 = pl.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])
s5.mode()
shape: (2,)
i64
7
3
# df5.mode()
df5 = pl.DataFrame(
    {
        "A": np.random.randint(0, 7, size=50),
        "B": np.random.randint(-10, 15, size=50),
    }
)
df5.select(pl.all().mode().implode())
shape: (1, 2)
A            B
list[i32]    list[i32]
[4]          [13, -2]
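Because mode() can return several values per column (B has two modes here), implode() wraps each result in a list so the frame still reduces to a single row.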

Discretization and quantiling#