Essential basic functionality#
import polars as pl
import numpy as np
from helper.jupyter import row
# Daily date column 2000-01-01 .. 2000-01-08, used as the row label of `df`.
index = pl.date_range(
    pl.date(2000, 1, 1), pl.date(2000, 1, 8), eager=True
).alias('index')

# Five labelled random values; a two-column frame stands in for a labelled Series.
s = pl.DataFrame({
    'index': list('abcde'),
    'value': np.random.randn(5),
})

# 8x3 random frame with the date column placed first.
df = pl.DataFrame(
    np.random.randn(8, 3), schema=list('ABC')
).insert_column(0, index)
# Fixed sample values (whitespace-separated text, header row first) so the
# outputs that follow are reproducible, unlike the random frame built above.
data = '''
a b c
-0.173215 0.119209 -1.044236
-0.861849 -2.104569 -0.494929
+1.071804 0.721555 -0.706771
-1.039575 0.271860 -0.424972
+0.567020 0.276232 -1.087401
-0.673690 0.113648 -1.478427
+0.524988 0.404705 0.577046
-1.715002 -1.039268 -0.370647
'''
from helper.polars import to_dataframe
# to_dataframe is a project helper (not shown here); presumably it parses the
# text table into a DataFrame -- confirm against helper/polars.
# The date column is then placed first, replacing the earlier random `df`.
df = to_dataframe(data).insert_column(0, index)
Head and tail#
long_series = pl.Series(np.random.randn(1000))
long_series.head(5)
shape: (5,)
f64 |
0.033182 |
-0.877532 |
0.696501 |
0.685849 |
0.271837 |
long_series.tail(3)
shape: (3,)
f64 |
-0.510464 |
-0.84928 |
-0.308495 |
Attributes and underlying data#
df[:2]
shape: (2, 4)
index | a | b | c |
---|---|---|---|
date | f64 | f64 | f64 |
2000-01-01 | -0.173215 | 0.119209 | -1.044236 |
2000-01-02 | -0.861849 | -2.104569 | -0.494929 |
# Lower-case every column name by passing a function to rename().
# The columns parsed from `data` are already lower-case, so the frame is unchanged here.
df = df.rename(str.lower)
df
shape: (8, 4)
index | a | b | c |
---|---|---|---|
date | f64 | f64 | f64 |
2000-01-01 | -0.173215 | 0.119209 | -1.044236 |
2000-01-02 | -0.861849 | -2.104569 | -0.494929 |
2000-01-03 | 1.071804 | 0.721555 | -0.706771 |
2000-01-04 | -1.039575 | 0.27186 | -0.424972 |
2000-01-05 | 0.56702 | 0.276232 | -1.087401 |
2000-01-06 | -0.67369 | 0.113648 | -1.478427 |
2000-01-07 | 0.524988 | 0.404705 | 0.577046 |
2000-01-08 | -1.715002 | -1.039268 | -0.370647 |
s['value'].to_arrow()
<pyarrow.lib.DoubleArray object at 0x0000018C7926DC00>
[
-1.0640128516665701,
-0.8614483783464599,
0.05714225040961911,
0.9208589421796419,
0.44817508840960785
]
s['index'].to_arrow()
<pyarrow.lib.LargeStringArray object at 0x0000018C7926DF00>
[
"a",
"b",
"c",
"d",
"e"
]
s['value'].to_numpy()
array([-1.06401285, -0.86144838, 0.05714225, 0.92085894, 0.44817509])
np.asarray(s['value'])
array([-1.06401285, -0.86144838, 0.05714225, 0.92085894, 0.44817509])
# Two consecutive days as naive datetimes, then stamped with the CET time zone.
naive = pl.date_range(
    pl.date(2000, 1, 1), pl.date(2000, 1, 2), eager=True
).cast(pl.Datetime)
ser = naive.dt.replace_time_zone('CET')
# The NumPy export carries no time zone; the values shown are the UTC instants.
ser.to_numpy()
array(['1999-12-31T23:00:00.000000', '2000-01-01T23:00:00.000000'],
dtype='datetime64[us]')
# pandas: df.to_numpy()
# The date column is left out so that only the three f64 columns are exported.
df.select(pl.exclude('index')).to_numpy()
array([[-0.173215, 0.119209, -1.044236],
[-0.861849, -2.104569, -0.494929],
[ 1.071804, 0.721555, -0.706771],
[-1.039575, 0.27186 , -0.424972],
[ 0.56702 , 0.276232, -1.087401],
[-0.67369 , 0.113648, -1.478427],
[ 0.524988, 0.404705, 0.577046],
[-1.715002, -1.039268, -0.370647]])
Accelerated operations#
Flexible binary operations#
# Sample frame with a string label column and NaN holes in `one` and `three`.
data = '''
index one two three
a 1.394981 1.772517 NaN
b 0.343054 1.912123 -0.050390
c 0.695246 1.478369 1.227435
d NaN 0.279344 -0.613172
'''
# to_dataframe is the project helper imported earlier in this document.
df = to_dataframe(data)
df
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | f64 | f64 | f64 |
"a" | 1.394981 | 1.772517 | NaN |
"b" | 0.343054 | 1.912123 | -0.05039 |
"c" | 0.695246 | 1.478369 | 1.227435 |
"d" | NaN | 0.279344 | -0.613172 |
# Second row as a {column: value} dict, and column `two` exposed under the name `value`.
row_data = df.row(1, named=True)
column = df.select('index', pl.col('two').alias('value'))
print(row_data)
print(column)
{'index': 'b', 'one': 0.343054, 'two': 1.912123, 'three': -0.05039}
shape: (4, 2)
┌───────┬──────────┐
│ index ┆ value │
│ --- ┆ --- │
│ str ┆ f64 │
╞═══════╪══════════╡
│ a ┆ 1.772517 │
│ b ┆ 1.912123 │
│ c ┆ 1.478369 │
│ d ┆ 0.279344 │
└───────┴──────────┘
# pandas: df.sub(row, axis="columns")
# Subtract the selected row's value from every entry of the matching column.
value_names = [name for name in row_data if name != 'index']
df.select(
    'index',
    *(pl.col(name) - row_data[name] for name in value_names),
)
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | f64 | f64 | f64 |
"a" | 1.051927 | -0.139606 | NaN |
"b" | 0.0 | 0.0 | 0.0 |
"c" | 0.352192 | -0.433754 | 1.277825 |
"d" | NaN | -1.632779 | -0.562782 |
# pandas: df.sub(column, axis="index")
df.select(
'index',
# `column` was selected from this same frame, so its values line up with df's rows.
pl.exclude('index') - column['value']
)
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | f64 | f64 | f64 |
"a" | -0.377536 | 0.0 | NaN |
"b" | -1.569069 | 0.0 | -1.962513 |
"c" | -0.783123 | 0.0 | -0.250934 |
"d" | NaN | 0.0 | -0.892516 |
# Two key columns standing in for a pandas MultiIndex (first, second).
mi = pl.DataFrame({
    'first': [1, 1, 1, 2],
    'second': ['a', 'b', 'c', 'a'],
})
# Swap the single `index` label column for the two key columns.
dfmi = pl.concat([mi, df.drop('index')], how='horizontal')
dfmi
shape: (4, 5)
first | second | one | two | three |
---|---|---|---|---|
i64 | str | f64 | f64 | f64 |
1 | "a" | 1.394981 | 1.772517 | NaN |
1 | "b" | 0.343054 | 1.912123 | -0.05039 |
1 | "c" | 0.695246 | 1.478369 | 1.227435 |
2 | "a" | NaN | 0.279344 | -0.613172 |
# pandas: dfmi.sub(column, axis=0, level="second")
# Look up each row's value through its `second` key *inside* the frame. The
# previous version joined `column` separately and subtracted the result by
# position, which is only correct if that join happens to return rows in
# dfmi's order -- an ordering the join does not promise.
# NOTE(review): on Polars releases whose join() accepts `maintain_order`,
# pass maintain_order='left' to also pin the displayed row order.
keys = ['first', 'second']
value_cols = [name for name in dfmi.columns if name not in keys]
aligned = dfmi.join(column, left_on='second', right_on='index', how='left')
# Same name and columns (index, value) as before, now one entry per row of `aligned`.
new_column = aligned.select(pl.col('second').alias('index'), 'value')
aligned.select(*keys, pl.col(value_cols) - pl.col('value'))
shape: (4, 5)
first | second | one | two | three |
---|---|---|---|---|
i64 | str | f64 | f64 | f64 |
1 | "a" | -0.377536 | 0.0 | NaN |
1 | "b" | -1.569069 | 0.0 | -1.962513 |
1 | "c" | -0.783123 | 0.0 | -0.250934 |
2 | "a" | NaN | -1.493173 | -2.385689 |
# Integers 0..9, then their quotient and remainder on division by 3.
s = pl.Series(np.arange(10))
div = s // 3
rem = s % 3
# Show the three series side by side.
row(s, div, rem)
shape: (10,)
|
shape: (10,)
|
shape: (10,)
|
Missing data / operations with fill values#
# pandas: df2.loc["a", "three"] = 1.0
df2 = df.with_columns(
# Intended: rows labelled 'a' get 1.0 in `three`, all other rows keep their value.
pl.when(pl.col('index') == 'a')
.then(1.0)
# NOTE(review): assumes the bare string is read as a column reference, not a string literal -- confirm.
.otherwise('three')
# NOTE(review): the condition's root column is `index`; verify that name.keep()
# really names the result `three` on the Polars version in use.
.name.keep()
)
row(df, df2)
shape: (4, 4)
|
shape: (4, 4)
|
# pandas: df2.loc["a", "three"] = 1.0
# Same assignment expressed as an update from a one-row frame keyed on `index`.
df2 = df.update(pl.select(index=pl.lit('a'), three=1), on='index')
# pandas: df + df2
from helper.polars import align_op
# align_op is a project helper (not shown here); it is called with the two
# frames, the key column and the binary Expr method to apply.
align_op(df, df2, on='index', op=pl.Expr.add)
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | f64 | f64 | f64 |
"a" | 2.789962 | 3.545034 | NaN |
"b" | 0.686108 | 3.824246 | -0.10078 |
"c" | 1.390492 | 2.956738 | 2.45487 |
"d" | NaN | 0.558688 | -1.226344 |
Flexible comparisons#
# df.gt(df2)
align_op(df, df2, on='index', op=pl.Expr.gt)
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | bool | bool | bool |
"a" | false | false | true |
"b" | false | false | false |
"c" | false | false | false |
"d" | false | false | false |
# df2.ne(df)
align_op(df2, df, on='index', op=pl.Expr.ne)
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | bool | bool | bool |
"a" | false | false | true |
"b" | false | false | false |
"c" | false | false | false |
"d" | false | false | false |
Boolean reductions#
# (df > 0).all()
df.select((pl.exclude('index') > 0).all())
shape: (1, 3)
one | two | three |
---|---|---|
bool | bool | bool |
true | true | false |
# (df > 0).any()
df.select((pl.exclude('index') > 0).any())
shape: (1, 3)
one | two | three |
---|---|---|
bool | bool | bool |
true | true | true |
# pandas: (df > 0).any().any()
# any_horizontal reduces across the columns of each row, .any() then reduces
# down the rows, and .item() unwraps the single remaining value.
df.select(pl.any_horizontal(pl.exclude('index') > 0).any()).item()
True
df.is_empty()
False
pl.DataFrame(schema=list('ABC')).is_empty()
True
Comparing if objects are equivalent#
# pandas: df + df == df * 2
# Build both sides first, then compare them element-wise.
summed = align_op(df, df, op=pl.Expr.add)
doubled = df.select('index', pl.exclude('index') * 2)
align_op(summed, doubled, op=pl.Expr.eq)
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | bool | bool | bool |
"a" | true | true | true |
"b" | true | true | true |
"c" | true | true | true |
"d" | true | true | true |
In Polars, NaN compares equal to NaN (unlike pandas and NumPy, where NaN != NaN):
pl.Series([np.nan]) == pl.Series([np.nan])
shape: (1,)
bool |
true |
# pandas: (df + df).equals(df * 2)
# Whole-frame comparison that yields a single bool.
summed = align_op(df, df, op=pl.Expr.add)
doubled = df.select('index', pl.exclude('index') * 2)
summed.equals(doubled)
True
# The same (index, col) pairs, listed in opposite row order.
df1 = pl.DataFrame({"index":[0, 1, 2], "col": [1.0, 0, np.nan]})
df2 = pl.DataFrame({"index":[2, 1, 0], "col": [np.nan, 0, 1.0]})
# The rows are not in the same order, so the frames do not compare equal as-is.
df1.equals(df2)
False
df1.equals(df2.sort('index'))
True
Comparing array-like objects#
pl.Series(["foo", "bar", "baz"]) == "foo"
shape: (3,)
bool |
true |
false |
false |
row(
pl.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"]),
pl.Series(["foo", "bar", "baz"]) == pl.Series(np.array(["foo", "bar", "qux"]))
)
shape: (3,)
|
shape: (3,)
|
Combining overlapping data sets#
df1 = pl.DataFrame({
"A": [1.0, np.nan, 3.0, 5.0, np.nan],
"B": [np.nan, 2.0, 3.0, np.nan, 6.0]
})
df2 = pl.DataFrame({
"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
"B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
})
row(df1, df2)
shape: (5, 2)
|
shape: (6, 2)
|
# df1.combine_first(df2)
(
df1
.with_row_index()
.join(
df2.with_row_index(), on='index', how='left'
)
.fill_nan(None)
.select(
[pl.coalesce(pl.col(c), pl.col(f'{c}_right')) for c in df1.columns]
)
)
shape: (5, 2)
A | B |
---|---|
f64 | f64 |
1.0 | null |
2.0 | 2.0 |
3.0 | 3.0 |
5.0 | 4.0 |
3.0 | 6.0 |
General DataFrame combine#
# pandas: df1.combine(df2, combiner)
def combiner(x, y):
    """Return `y` wherever `x` is missing (null or NaN), otherwise `x`.

    `x` and `y` are expected to be Polars expressions for the same column of
    the left and right frame; the result is a Polars expression.
    """
    # fill_nan(None) turns NaN into null so one is_null() test covers both.
    return pl.when(x.fill_nan(None).is_null()).then(y).otherwise(x)


align_op(df1.with_row_index(), df2.with_row_index(), combiner, how='full', fill_value=None)
shape: (6, 3)
index | A | B |
---|---|---|
u32 | f64 | f64 |
0 | 1.0 | NaN |
1 | 2.0 | 2.0 |
2 | 3.0 | 3.0 |
3 | 5.0 | 4.0 |
4 | 3.0 | 6.0 |
5 | 7.0 | 8.0 |
Descriptive statistics#
df
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | f64 | f64 | f64 |
"a" | 1.394981 | 1.772517 | NaN |
"b" | 0.343054 | 1.912123 | -0.05039 |
"c" | 0.695246 | 1.478369 | 1.227435 |
"d" | NaN | 0.279344 | -0.613172 |
# pandas: df.mean(0)
# NaN -> null first, so the mean is taken over the non-missing values only
# (pandas' default skipna=True behaviour).
df.select(pl.exclude('index').fill_nan(None).mean())
shape: (1, 3)
one | two | three |
---|---|---|
f64 | f64 | f64 |
0.811094 | 1.360588 | 0.187958 |
# df.mean(1)
df.select(
'index',
pl.mean_horizontal(pl.exclude('index').fill_nan(None))
)
shape: (4, 2)
index | one |
---|---|
str | f64 |
"a" | 1.583749 |
"b" | 0.734929 |
"c" | 1.133683 |
"d" | -0.166914 |
# pandas: df.sum(0, skipna=False)
# No fill_nan() here on purpose: NaN propagates into the column sums.
df.select(pl.exclude('index').sum())
shape: (1, 3)
one | two | three |
---|---|---|
f64 | f64 | f64 |
NaN | 5.442353 | NaN |
# df.sum(axis=1, skipna=True)
df.select(
'index',
pl.sum_horizontal(pl.exclude('index').fill_nan(None))
)
shape: (4, 2)
index | one |
---|---|
str | f64 |
"a" | 3.167498 |
"b" | 2.204787 |
"c" | 3.40105 |
"d" | -0.333828 |
# NaN -> null first, so mean() and std() skip the missing entries.
cols = pl.exclude('index').fill_nan(None)
# Standardise each column: subtract its mean, divide by its standard deviation.
ts_stand = df.select(
(cols - cols.mean()) / cols.std()
)
# Every standardised column should now have unit standard deviation.
ts_stand.std()
shape: (1, 3)
one | two | three |
---|---|---|
f64 | f64 | f64 |
1.0 | 1.0 | 1.0 |
# pandas: xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)
cols = pl.exclude('index').fill_nan(None)
# Pack each row's values into one list and keep its `.list` namespace, so the
# .mean() / .std() calls below are evaluated per row.
cols_list = pl.concat_list(cols).list
xs_stand = df.select(
'index',
(cols - cols_list.mean()) / cols_list.std()
)
# Re-evaluating the row-wise std on the standardised frame should give 1.0 per row.
xs_stand.select(
'index',
cols_list.std()
)
shape: (4, 2)
index | one |
---|---|
str | f64 |
"a" | 1.0 |
"b" | 1.0 |
"c" | 1.0 |
"d" | 1.0 |
# pandas: df.cumsum()
# NaN -> null first: missing entries stay null in the running sum while the
# values after them keep accumulating (see column `three`).
cols = pl.exclude('index').fill_nan(None)
df.select(
'index',
cols.cum_sum()
)
shape: (4, 4)
index | one | two | three |
---|---|---|---|
str | f64 | f64 | f64 |
"a" | 1.394981 | 1.772517 | null |
"b" | 1.738035 | 3.68464 | -0.05039 |
"c" | 2.433281 | 5.163009 | 1.177045 |
"d" | null | 5.442353 | 0.563873 |
np.mean(df['one'].to_numpy())
nan
series = pl.Series(np.random.randn(500))
# Positions 20..499 become NaN and positions 10..19 become 5, leaving the
# first ten random draws, the value 5 and NaN.
series[np.arange(20, 500)] = np.nan
series[np.arange(10, 20)] = 5
# NaN is counted as one distinct value here.
series.n_unique()
12
series.drop_nans().n_unique()
11
Summarizing data: describe#
series = pl.Series(np.random.randn(1000))
series[np.arange(0, len(series), 2)] = None
series.describe()
shape: (9, 2)
statistic | value |
---|---|
str | f64 |
"count" | 500.0 |
"null_count" | 500.0 |
"mean" | 0.031273 |
"std" | 1.005434 |
"min" | -3.13751 |
"25%" | -0.626407 |
"50%" | 0.057798 |
"75%" | 0.70786 |
"max" | 2.828656 |
frame = pl.DataFrame(np.random.randn(1000, 5), schema=["a", "b", "c", "d", "e"])
# pandas: frame.iloc[::2] = np.nan
# Here the blanked rows become null (not NaN); describe() reports them under null_count.
frame = frame.select(
# Blank out every even-numbered row (0, 2, 4, ...) in all columns ...
pl.when(pl.int_range(0, pl.len()) % 2 == 0)
.then(None)
.otherwise(pl.all())
# ... keeping each column's original name.
.name.keep()
)
frame.describe()
shape: (9, 6)
statistic | a | b | c | d | e |
---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 |
"count" | 500.0 | 500.0 | 500.0 | 500.0 | 500.0 |
"null_count" | 500.0 | 500.0 | 500.0 | 500.0 | 500.0 |
"mean" | -0.009165 | 0.062481 | 0.038552 | -0.028247 | -0.049076 |
"std" | 0.964467 | 1.007907 | 1.040596 | 0.981704 | 1.02394 |
"min" | -3.296775 | -3.199059 | -3.118371 | -2.742383 | -2.794847 |
"25%" | -0.672506 | -0.58794 | -0.618475 | -0.714627 | -0.755011 |
"50%" | -0.046801 | 0.078398 | -0.028415 | -0.062934 | -0.046535 |
"75%" | 0.645693 | 0.704563 | 0.756428 | 0.627254 | 0.649599 |
"max" | 2.668789 | 2.729561 | 2.750978 | 3.768601 | 3.231683 |
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])
shape: (10, 2)
statistic | value |
---|---|
str | f64 |
"count" | 500.0 |
"null_count" | 500.0 |
"mean" | 0.031273 |
"std" | 1.005434 |
"min" | -3.13751 |
"5%" | -1.617022 |
"25%" | -0.626407 |
"75%" | 0.70786 |
"95%" | 1.621126 |
"max" | 2.828656 |
s = pl.Series(["a", "a", "b", "b", "a", "a", None, "c", "d", "a"])
s.describe()
shape: (4, 2)
statistic | value |
---|---|
str | str |
"count" | "9" |
"null_count" | "1" |
"min" | "a" |
"max" | "d" |
frame = pl.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)})
frame.describe()
shape: (9, 3)
statistic | a | b |
---|---|---|
str | str | f64 |
"count" | "4" | 4.0 |
"null_count" | "0" | 0.0 |
"mean" | null | 1.5 |
"std" | null | 1.290994 |
"min" | "No" | 0.0 |
"25%" | null | 1.0 |
"50%" | null | 2.0 |
"75%" | null | 2.0 |
"max" | "Yes" | 3.0 |
Index of min/max values#
s1 = pl.Series(np.random.randn(5))
s1
shape: (5,)
f64 |
0.642147 |
1.132986 |
0.536466 |
0.2154 |
0.539621 |
# s1.idxmin(), s1.idxmax()
s1.arg_min(), s1.arg_max()
(3, 1)
df1 = pl.DataFrame(np.random.randn(5, 3), schema=["A", "B", "C"])
df1
shape: (5, 3)
A | B | C |
---|---|---|
f64 | f64 | f64 |
-0.331573 | 0.554591 | -0.581225 |
-0.154205 | 1.388221 | -0.085109 |
0.993925 | -1.25826 | -1.204862 |
-0.268844 | -0.079973 | -1.734103 |
2.140542 | 0.792179 | -1.549576 |
# df1.idxmin(axis=0)
df1.select(pl.all().arg_min())
shape: (1, 3)
A | B | C |
---|---|---|
u32 | u32 | u32 |
0 | 2 | 3 |
# pandas: df1.idxmax(axis=1)
df1.select(
# Column names as a literal Series, looked up by ...
arg_max=pl.lit(pl.Series(df1.columns)).get(
# ... the position of the largest value within each row's list of values.
pl.concat_list(pl.all()).list.arg_max()
)
)
shape: (5, 1)
arg_max |
---|
str |
"B" |
"B" |
"A" |
"B" |
"A" |
# Labels in reverse order, a tie for the smallest value of A, and a trailing null.
df3 = pl.DataFrame(
    [['e', 'd', 'c', 'b', 'a'],
     [2, 1, 1, 3, None]],
    schema=["index", "A"],
    # The two inner lists are columns; state it instead of relying on the
    # constructor inferring the orientation from the data's shape.
    orient="col",
)
df3
shape: (5, 2)
index | A |
---|---|
str | i64 |
"e" | 2 |
"d" | 1 |
"c" | 1 |
"b" | 3 |
"a" | null |
# pandas: df3["A"].idxmin()
# The example is about ties for the minimum (A has two 1s), so the lookup has
# to use arg_min; the previous arg_max call returned the label of the maximum.
# NOTE(review): relies on arg_min skipping the null and returning the first of
# the tied positions -- confirm on the Polars version in use.
df3.select(
    pl.col('index').get(pl.col('A').arg_min())
).item()
'd'
Value counts (histogramming) / mode#
data = np.array(
[6, 6, 2, 3, 5, 3, 2, 5, 4, 5, 4, 3,
4, 5, 0, 2, 0, 4, 2, 0, 3, 2, 2, 5,
6, 5, 3, 4, 6, 4, 3, 5, 6, 4, 3, 6,
2, 6, 6, 2, 3, 4, 2, 1, 6, 2, 6, 1, 5, 4])
s = pl.Series('value', data)
s.value_counts()
shape: (7, 2)
value | count |
---|---|
i32 | u32 |
5 | 8 |
2 | 10 |
6 | 10 |
4 | 9 |
0 | 3 |
1 | 2 |
3 | 8 |
# pandas: frame.value_counts()
data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
frame = pl.DataFrame(data)
frame.select(
# Pack each row into a struct, count identical structs, then unnest the
# (struct, count) result into two columns; the struct column ends up named `a`.
pl.struct(pl.all()).value_counts().struct.unnest()
)
shape: (4, 2)
a | count |
---|---|
struct[2] | u32 |
{1,"x"} | 1 |
{3,"y"} | 1 |
{4,"y"} | 1 |
{2,"x"} | 1 |
s5 = pl.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])
s5.mode()
shape: (2,)
i64 |
7 |
3 |
# pandas: df5.mode()
df5 = pl.DataFrame(
{
"A": np.random.randint(0, 7, size=50),
"B": np.random.randint(-10, 15, size=50),
}
)
# Columns can have different numbers of modes, so each column's modes are
# imploded into one list to fit in a single row.
df5.select(pl.all().mode().implode())
shape: (1, 2)
A | B |
---|---|
list[i32] | list[i32] |
[4] | [13, -2] |