10 minutes to pandas#
import numpy as np
import polars as pl
from helper.jupyter import row
def to_dataframe(data):
import re
from io import StringIO
data = re.sub(r'[ \t]+', ',', data.strip())
return pl.read_csv(StringIO(data))
Object creation#
s = pl.Series([1, 3, 5, np.nan, 6, 8], strict=False)
s2 = pl.Series([1, 3, 5, None, 6, 8])
row(s, s2)
shape: (6,)
|
shape: (6,)
|
dates = pl.date_range(
pl.date(2013, 1, 1),
pl.date(2013, 1, 6),
interval='1d', eager=True).alias('index')
dates
shape: (6,)
index |
---|
date |
2013-01-01 |
2013-01-02 |
2013-01-03 |
2013-01-04 |
2013-01-05 |
2013-01-06 |
df = pl.DataFrame(np.random.randn(6, 4), schema=list("ABCD"))
df.insert_column(0, dates)
df
shape: (6, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | -1.281556 | -0.976005 | -0.44965 | 1.508403 |
2013-01-02 | 2.299578 | 0.450922 | -0.758744 | -0.762308 |
2013-01-03 | 0.489373 | -0.124009 | 0.597175 | -0.365254 |
2013-01-04 | -0.689202 | 0.040563 | 0.883405 | 0.400285 |
2013-01-05 | 0.972164 | 1.596552 | 1.191523 | -1.907236 |
2013-01-06 | 0.830918 | 0.211487 | 0.91045 | 1.73558 |
arr = np.array([[ 0.4691, -0.2829, -1.5091, -1.1356],
[ 1.2121, -0.1732, 0.1192, -1.0442],
[-0.8618, -2.1046, -0.4949, 1.0718],
[ 0.7216, -0.7068, -1.0396, 0.2719],
[-0.425 , 0.567 , 0.2762, -1.0874],
[-0.6737, 0.1136, -1.4784, 0.525 ]])
df = pl.DataFrame(arr, schema=list("ABCD"))
df.insert_column(0, dates)
shape: (6, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | 0.4691 | -0.2829 | -1.5091 | -1.1356 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 |
2013-01-05 | -0.425 | 0.567 | 0.2762 | -1.0874 |
2013-01-06 | -0.6737 | 0.1136 | -1.4784 | 0.525 |
df2 = pl.select(
A=1.0,
B=pl.date(2013, 1, 2),
D=np.array([3] * 4, dtype="int32"),
E=pl.lit(pl.Series(['test', 'train', 'test', 'train'], dtype=pl.Categorical)),
F=pl.lit("foo"),
)
df2
shape: (4, 5)
A | B | D | E | F |
---|---|---|---|---|
f64 | date | i32 | cat | str |
1.0 | 2013-01-02 | 3 | "test" | "foo" |
1.0 | 2013-01-02 | 3 | "train" | "foo" |
1.0 | 2013-01-02 | 3 | "test" | "foo" |
1.0 | 2013-01-02 | 3 | "train" | "foo" |
df2.dtypes
[Float64, Date, Int32, Categorical(ordering='physical'), String]
df2.schema
Schema([('A', Float64),
('B', Date),
('D', Int32),
('E', Categorical(ordering='physical')),
('F', String)])
Viewing data#
df.head(5)
shape: (5, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | 0.4691 | -0.2829 | -1.5091 | -1.1356 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 |
2013-01-05 | -0.425 | 0.567 | 0.2762 | -1.0874 |
df.tail(3)
shape: (3, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 |
2013-01-05 | -0.425 | 0.567 | 0.2762 | -1.0874 |
2013-01-06 | -0.6737 | 0.1136 | -1.4784 | 0.525 |
df.get_column('index')
shape: (6,)
index |
---|
date |
2013-01-01 |
2013-01-02 |
2013-01-03 |
2013-01-04 |
2013-01-05 |
2013-01-06 |
df.columns
['index', 'A', 'B', 'C', 'D']
df.drop('index').to_numpy()
array([[ 0.4691, -0.2829, -1.5091, -1.1356],
[ 1.2121, -0.1732, 0.1192, -1.0442],
[-0.8618, -2.1046, -0.4949, 1.0718],
[ 0.7216, -0.7068, -1.0396, 0.2719],
[-0.425 , 0.567 , 0.2762, -1.0874],
[-0.6737, 0.1136, -1.4784, 0.525 ]])
df2.dtypes
[Float64, Date, Int32, Categorical(ordering='physical'), String]
df2.schema
Schema([('A', Float64),
('B', Date),
('D', Int32),
('E', Categorical(ordering='physical')),
('F', String)])
df2.to_numpy()
array([[1.0, datetime.date(2013, 1, 2), 3, 'test', 'foo'],
[1.0, datetime.date(2013, 1, 2), 3, 'train', 'foo'],
[1.0, datetime.date(2013, 1, 2), 3, 'test', 'foo'],
[1.0, datetime.date(2013, 1, 2), 3, 'train', 'foo']], dtype=object)
df2.to_numpy(structured=True)
array([(1., '2013-01-02', 3, 'test', 'foo'),
(1., '2013-01-02', 3, 'train', 'foo'),
(1., '2013-01-02', 3, 'test', 'foo'),
(1., '2013-01-02', 3, 'train', 'foo')],
dtype=[('A', '<f8'), ('B', '<M8[D]'), ('D', '<i4'), ('E', 'O'), ('F', '<U3')])
df.describe()
shape: (9, 6)
statistic | index | A | B | C | D |
---|---|---|---|---|---|
str | str | f64 | f64 | f64 | f64 |
"count" | "6" | 6.0 | 6.0 | 6.0 | 6.0 |
"null_count" | "0" | 0.0 | 0.0 | 0.0 | 0.0 |
"mean" | "2013-01-03 12:00:00" | 0.073717 | -0.43115 | -0.687767 | -0.233083 |
"std" | null | 0.843153 | 0.922821 | 0.779885 | 0.973111 |
"min" | "2013-01-01" | -0.8618 | -2.1046 | -1.5091 | -1.1356 |
"25%" | "2013-01-02" | -0.6737 | -0.7068 | -1.4784 | -1.0874 |
"50%" | "2013-01-04" | 0.4691 | -0.1732 | -0.4949 | 0.2719 |
"75%" | "2013-01-05" | 0.7216 | 0.1136 | 0.1192 | 0.525 |
"max" | "2013-01-06" | 1.2121 | 0.567 | 0.2762 | 1.0718 |
df.drop('index').transpose(
include_header=True,
column_names=df.get_column('index').cast(pl.String))
shape: (4, 7)
column | 2013-01-01 | 2013-01-02 | 2013-01-03 | 2013-01-04 | 2013-01-05 | 2013-01-06 |
---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 |
"A" | 0.4691 | 1.2121 | -0.8618 | 0.7216 | -0.425 | -0.6737 |
"B" | -0.2829 | -0.1732 | -2.1046 | -0.7068 | 0.567 | 0.1136 |
"C" | -1.5091 | 0.1192 | -0.4949 | -1.0396 | 0.2762 | -1.4784 |
"D" | -1.1356 | -1.0442 | 1.0718 | 0.2719 | -1.0874 | 0.525 |
df.select('index', *sorted(df.columns[1:], reverse=True))
shape: (6, 5)
index | D | C | B | A |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | -1.1356 | -1.5091 | -0.2829 | 0.4691 |
2013-01-02 | -1.0442 | 0.1192 | -0.1732 | 1.2121 |
2013-01-03 | 1.0718 | -0.4949 | -2.1046 | -0.8618 |
2013-01-04 | 0.2719 | -1.0396 | -0.7068 | 0.7216 |
2013-01-05 | -1.0874 | 0.2762 | 0.567 | -0.425 |
2013-01-06 | 0.525 | -1.4784 | 0.1136 | -0.6737 |
df.sort('B')
shape: (6, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 |
2013-01-01 | 0.4691 | -0.2829 | -1.5091 | -1.1356 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 |
2013-01-06 | -0.6737 | 0.1136 | -1.4784 | 0.525 |
2013-01-05 | -0.425 | 0.567 | 0.2762 | -1.0874 |
Selection#
Getitem ([])#
df['A']
shape: (6,)
A |
---|
f64 |
0.4691 |
1.2121 |
-0.8618 |
0.7216 |
-0.425 |
-0.6737 |
df.slice(0, 3)
shape: (3, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | 0.4691 | -0.2829 | -1.5091 | -1.1356 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 |
df.filter(
pl.col('index').is_between(
pl.date(2013, 1, 2),
pl.date(2013, 1, 4)
)
)
shape: (3, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 |
Selection by label#
df.filter(pl.col('index') == pl.col('index').first())
shape: (1, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | 0.4691 | -0.2829 | -1.5091 | -1.1356 |
df.select('index', 'A', 'B')
shape: (6, 3)
index | A | B |
---|---|---|
date | f64 | f64 |
2013-01-01 | 0.4691 | -0.2829 |
2013-01-02 | 1.2121 | -0.1732 |
2013-01-03 | -0.8618 | -2.1046 |
2013-01-04 | 0.7216 | -0.7068 |
2013-01-05 | -0.425 | 0.567 |
2013-01-06 | -0.6737 | 0.1136 |
(df
.filter(
pl.col('index').is_between(
pl.date(2013, 1, 2),
pl.date(2013, 1, 4)
)
)
.select('index', 'A', 'B')
)
shape: (3, 3)
index | A | B |
---|---|---|
date | f64 | f64 |
2013-01-02 | 1.2121 | -0.1732 |
2013-01-03 | -0.8618 | -2.1046 |
2013-01-04 | 0.7216 | -0.7068 |
(
df
.filter(pl.col('index') == pl.col('index').first())
.select('A')
.item()
)
0.4691
df.row(0, named=True)['A']
0.4691
Selection by position#
df.slice(3, 1)
shape: (1, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 |
df.select(pl.nth(range(3)).slice(3, 2))
shape: (2, 3)
index | A | B |
---|---|---|
date | f64 | f64 |
2013-01-04 | 0.7216 | -0.7068 |
2013-01-05 | -0.425 | 0.567 |
df.select(pl.nth([0, 1, 3]).gather([1, 2, 4]))
shape: (3, 3)
index | A | C |
---|---|---|
date | f64 | f64 |
2013-01-02 | 1.2121 | 0.1192 |
2013-01-03 | -0.8618 | -0.4949 |
2013-01-05 | -0.425 | 0.2762 |
df.slice(1, 2)
shape: (2, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 |
df.select(pl.nth(0), pl.nth(range(2, 4)))
shape: (6, 3)
index | B | C |
---|---|---|
date | f64 | f64 |
2013-01-01 | -0.2829 | -1.5091 |
2013-01-02 | -0.1732 | 0.1192 |
2013-01-03 | -2.1046 | -0.4949 |
2013-01-04 | -0.7068 | -1.0396 |
2013-01-05 | 0.567 | 0.2762 |
2013-01-06 | 0.1136 | -1.4784 |
df.row(1)[2]
-0.1732
Boolean indexing#
df.filter(pl.col('A') > 0)
shape: (3, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | 0.4691 | -0.2829 | -1.5091 | -1.1356 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 |
df.select(
pl.col('index'),
pl.when(pl.exclude('index') > 0)
.then(pl.exclude('index'))
.otherwise(None)
)
shape: (6, 5)
index | A | B | C | D |
---|---|---|---|---|
date | f64 | f64 | f64 | f64 |
2013-01-01 | 0.4691 | null | null | null |
2013-01-02 | 1.2121 | null | 0.1192 | null |
2013-01-03 | null | null | null | 1.0718 |
2013-01-04 | 0.7216 | null | null | 0.2719 |
2013-01-05 | null | 0.567 | 0.2762 | null |
2013-01-06 | null | 0.1136 | null | 0.525 |
df2 = df.with_columns(
E=pl.Series(['one', 'one', 'two', 'three', 'four', 'three'])
)
df2
shape: (6, 6)
index | A | B | C | D | E |
---|---|---|---|---|---|
date | f64 | f64 | f64 | f64 | str |
2013-01-01 | 0.4691 | -0.2829 | -1.5091 | -1.1356 | "one" |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | -1.0442 | "one" |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 | "two" |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 0.2719 | "three" |
2013-01-05 | -0.425 | 0.567 | 0.2762 | -1.0874 | "four" |
2013-01-06 | -0.6737 | 0.1136 | -1.4784 | 0.525 | "three" |
df2.filter(pl.col('E').is_in(['two', 'four']))
shape: (2, 6)
index | A | B | C | D | E |
---|---|---|---|---|---|
date | f64 | f64 | f64 | f64 | str |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 1.0718 | "two" |
2013-01-05 | -0.425 | 0.567 | 0.2762 | -1.0874 | "four" |
Setting#
# s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
s1 = pl.select(
index=pl.date_range(pl.date(2013, 1, 2), pl.date(2013, 1, 7)),
value=pl.Series([1, 2, 3, 4, 5, 6])
)
s1
shape: (6, 2)
index | value |
---|---|
date | i64 |
2013-01-02 | 1 |
2013-01-03 | 2 |
2013-01-04 | 3 |
2013-01-05 | 4 |
2013-01-06 | 5 |
2013-01-07 | 6 |
# df["F"] = s1
df = df.join(s1.select('index', F='value'), on='index', how='left')
# df.at[dates[0], "A"] = 0
df = df.with_columns(
pl.when(pl.col('index') == dates[0])
.then(0)
.otherwise('A')
.alias('A')
)
# df.iat[0, 1] = 0
df = df.with_columns(
pl.when(pl.int_range(pl.len()) == 0)
.then(0)
.otherwise(pl.nth(2))
.name.keep()
)
# df.loc[:, "D"] = np.array([5] * len(df))
df = df.with_columns(D=5)
df
shape: (6, 6)
index | A | B | C | D | F |
---|---|---|---|---|---|
date | f64 | f64 | f64 | i32 | i64 |
2013-01-01 | 0.0 | 0.0 | -1.5091 | 5 | null |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | 5 | 1 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 5 | 2 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 5 | 3 |
2013-01-05 | -0.425 | 0.567 | 0.2762 | 5 | 4 |
2013-01-06 | -0.6737 | 0.1136 | -1.4784 | 5 | 5 |
# df2[df2 > 0] = -df2
df2 = df.select(pl.all())
others = pl.exclude('index')
df2 = df2.select(
'index',
pl.when(others > 0)
.then(-others)
.otherwise(others)
)
df2
shape: (6, 6)
index | A | B | C | D | F |
---|---|---|---|---|---|
date | f64 | f64 | f64 | i32 | i64 |
2013-01-01 | 0.0 | 0.0 | -1.5091 | -5 | null |
2013-01-02 | -1.2121 | -0.1732 | -0.1192 | -5 | -1 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | -5 | -2 |
2013-01-04 | -0.7216 | -0.7068 | -1.0396 | -5 | -3 |
2013-01-05 | -0.425 | -0.567 | -0.2762 | -5 | -4 |
2013-01-06 | -0.6737 | -0.1136 | -1.4784 | -5 | -5 |
Missing data#
df1 = df.select(
pl.all().slice(0, 4),
E=pl.Series([1.0, 1.0, None, None])
)
df1
shape: (4, 7)
index | A | B | C | D | F | E |
---|---|---|---|---|---|---|
date | f64 | f64 | f64 | i32 | i64 | f64 |
2013-01-01 | 0.0 | 0.0 | -1.5091 | 5 | null | 1.0 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | 5 | 1 | 1.0 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 5 | 2 | null |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 5 | 3 | null |
df1.drop_nulls()
shape: (1, 7)
index | A | B | C | D | F | E |
---|---|---|---|---|---|---|
date | f64 | f64 | f64 | i32 | i64 | f64 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | 5 | 1 | 1.0 |
df1.fill_null(5)
shape: (4, 7)
index | A | B | C | D | F | E |
---|---|---|---|---|---|---|
date | f64 | f64 | f64 | i32 | i64 | f64 |
2013-01-01 | 0.0 | 0.0 | -1.5091 | 5 | 5 | 1.0 |
2013-01-02 | 1.2121 | -0.1732 | 0.1192 | 5 | 1 | 1.0 |
2013-01-03 | -0.8618 | -2.1046 | -0.4949 | 5 | 2 | 5.0 |
2013-01-04 | 0.7216 | -0.7068 | -1.0396 | 5 | 3 | 5.0 |
df1.select(
'index',
pl.exclude('index').is_null()
)
shape: (4, 7)
index | A | B | C | D | F | E |
---|---|---|---|---|---|---|
date | bool | bool | bool | bool | bool | bool |
2013-01-01 | false | false | false | false | true | false |
2013-01-02 | false | false | false | false | false | false |
2013-01-03 | false | false | false | false | false | true |
2013-01-04 | false | false | false | false | false | true |
Operations#
Stats#
df.select(pl.exclude('index').mean())
shape: (1, 5)
A | B | C | D | F |
---|---|---|---|---|
f64 | f64 | f64 | f64 | f64 |
-0.004467 | -0.384 | -0.687767 | 5.0 | 3.0 |
df.select(
'index',
pl.mean_horizontal(pl.exclude('index'))
)
shape: (6, 2)
index | A |
---|---|
date | f64 |
2013-01-01 | 0.872725 |
2013-01-02 | 1.43162 |
2013-01-03 | 0.70774 |
2013-01-04 | 1.39504 |
2013-01-05 | 1.88364 |
2013-01-06 | 1.5923 |
s = pl.select(
index=df.get_column('index'),
value=pl.Series([1, 3, 5, None, 6, 8]).shift(2)
)
s
shape: (6, 2)
index | value |
---|---|
date | i64 |
2013-01-01 | null |
2013-01-02 | null |
2013-01-03 | 1 |
2013-01-04 | 3 |
2013-01-05 | 5 |
2013-01-06 | null |
df_l, df_r = pl.align_frames(df, s, on='index')
df_l.select(
'index',
pl.exclude('index') - df_r.get_column('value')
)
shape: (6, 6)
index | A | B | C | D | F |
---|---|---|---|---|---|
date | f64 | f64 | f64 | i64 | i64 |
2013-01-01 | null | null | null | null | null |
2013-01-02 | null | null | null | null | null |
2013-01-03 | -1.8618 | -3.1046 | -1.4949 | 4 | 1 |
2013-01-04 | -2.2784 | -3.7068 | -4.0396 | 2 | 0 |
2013-01-05 | -5.425 | -4.433 | -4.7238 | 0 | -1 |
2013-01-06 | null | null | null | null | null |
User defined functions#
# df.agg(lambda x: np.mean(x) * 5.6)
df.select(
pl.exclude('index').mean() * 5.6
)
shape: (1, 5)
A | B | C | D | F |
---|---|---|---|---|
f64 | f64 | f64 | f64 | f64 |
-0.025013 | -2.1504 | -3.851493 | 28.0 | 16.8 |
# df.transform(lambda x: x * 101.2)
df.select(
pl.exclude('index') * 101.2
)
shape: (6, 5)
A | B | C | D | F |
---|---|---|---|---|
f64 | f64 | f64 | f64 | f64 |
0.0 | 0.0 | -152.72092 | 506.0 | null |
122.66452 | -17.52784 | 12.06304 | 506.0 | 101.2 |
-87.21416 | -212.98552 | -50.08388 | 506.0 | 202.4 |
73.02592 | -71.52816 | -105.20752 | 506.0 | 303.6 |
-43.01 | 57.3804 | 27.95144 | 506.0 | 404.8 |
-68.17844 | 11.49632 | -149.61408 | 506.0 | 506.0 |
Value Counts#
s = pl.Series(np.random.randint(0, 7, size=10))
s.value_counts()
shape: (5, 2)
count | |
---|---|
i32 | u32 |
6 | 2 |
4 | 1 |
0 | 1 |
5 | 3 |
2 | 3 |
String Methods#
s = pl.Series(["A", "B", "C", "Aaba", "Baca", None, "CABA", "dog", "cat"])
s.str.to_lowercase()
shape: (9,)
str |
"a" |
"b" |
"c" |
"aaba" |
"baca" |
null |
"caba" |
"dog" |
"cat" |
Merge#
Concat#
data = """
index 0 1 2 3
0 -0.548702 1.467327 -1.015962 -0.483075
1 1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952 0.991460 -0.919069 0.266046
3 -0.709661 1.669052 1.037882 -1.705775
4 -0.919854 -0.042379 1.247642 -0.009920
5 0.290213 0.495767 0.362949 1.548106
6 -1.131345 -0.089329 0.337863 -0.945867
7 -0.932132 1.956030 0.017587 -0.016692
8 -0.575247 0.254161 -1.143704 0.215897
9 1.193555 -0.077118 -0.408530 -0.862495
"""
df = to_dataframe(data)
df
shape: (10, 5)
index | 0 | 1 | 2 | 3 |
---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 |
0 | -0.548702 | 1.467327 | -1.015962 | -0.483075 |
1 | 1.63755 | -1.217659 | -0.291519 | -1.745505 |
2 | -0.263952 | 0.99146 | -0.919069 | 0.266046 |
3 | -0.709661 | 1.669052 | 1.037882 | -1.705775 |
4 | -0.919854 | -0.042379 | 1.247642 | -0.00992 |
5 | 0.290213 | 0.495767 | 0.362949 | 1.548106 |
6 | -1.131345 | -0.089329 | 0.337863 | -0.945867 |
7 | -0.932132 | 1.95603 | 0.017587 | -0.016692 |
8 | -0.575247 | 0.254161 | -1.143704 | 0.215897 |
9 | 1.193555 | -0.077118 | -0.40853 | -0.862495 |
pl.concat([df.slice(0, 3), df.slice(3, 4), df.slice(7)])
shape: (10, 5)
index | 0 | 1 | 2 | 3 |
---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 |
0 | -0.548702 | 1.467327 | -1.015962 | -0.483075 |
1 | 1.63755 | -1.217659 | -0.291519 | -1.745505 |
2 | -0.263952 | 0.99146 | -0.919069 | 0.266046 |
3 | -0.709661 | 1.669052 | 1.037882 | -1.705775 |
4 | -0.919854 | -0.042379 | 1.247642 | -0.00992 |
5 | 0.290213 | 0.495767 | 0.362949 | 1.548106 |
6 | -1.131345 | -0.089329 | 0.337863 | -0.945867 |
7 | -0.932132 | 1.95603 | 0.017587 | -0.016692 |
8 | -0.575247 | 0.254161 | -1.143704 | 0.215897 |
9 | 1.193555 | -0.077118 | -0.40853 | -0.862495 |
join#
left = pl.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pl.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
row(left, right)
shape: (2, 2)
|
shape: (2, 2)
|
left.join(right, on='key')
shape: (4, 3)
key | lval | rval |
---|---|---|
str | i64 | i64 |
"foo" | 1 | 4 |
"foo" | 2 | 4 |
"foo" | 1 | 5 |
"foo" | 2 | 5 |
left = pl.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pl.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
row(left, right)
shape: (2, 2)
|
shape: (2, 2)
|
left.join(right, on='key')
shape: (2, 3)
key | lval | rval |
---|---|---|
str | i64 | i64 |
"foo" | 1 | 4 |
"bar" | 2 | 5 |
Grouping#
data = """
A B C D
foo one 1.346061 -1.577585
bar one 1.511763 0.396823
foo two 1.627081 -0.105381
bar three -0.990582 -0.532532
foo two -0.441652 1.453749
bar two 1.211526 1.208843
foo one 0.268520 -0.080952
foo three 0.024580 -0.264610
"""
df = to_dataframe(data)
# df.groupby("A")[["C", "D"]].sum()
df.group_by('A').agg(
pl.col('C', 'D').sum()
)
shape: (2, 3)
A | C | D |
---|---|---|
str | f64 | f64 |
"foo" | 2.82459 | -0.574779 |
"bar" | 1.732707 | 1.073134 |
# df.groupby(["A", "B"]).sum()
(
df
.group_by('A', 'B')
.agg(pl.all().sum())
.sort('A', 'B')
)
shape: (6, 4)
A | B | C | D |
---|---|---|---|
str | str | f64 | f64 |
"bar" | "one" | 1.511763 | 0.396823 |
"bar" | "three" | -0.990582 | -0.532532 |
"bar" | "two" | 1.211526 | 1.208843 |
"foo" | "one" | 1.614581 | -1.658537 |
"foo" | "three" | 0.02458 | -0.26461 |
"foo" | "two" | 1.185429 | 1.348368 |
Reshaping#
Stack#
data = '''
first second A B
bar one -0.727965 -0.589346
bar two 0.339969 -0.693205
baz one -0.339355 0.593616
baz two 0.884345 1.591431
'''
df2 = to_dataframe(data)
df2
shape: (4, 4)
first | second | A | B |
---|---|---|---|
str | str | f64 | f64 |
"bar" | "one" | -0.727965 | -0.589346 |
"bar" | "two" | 0.339969 | -0.693205 |
"baz" | "one" | -0.339355 | 0.593616 |
"baz" | "two" | 0.884345 | 1.591431 |
stacked = df2.unpivot(['A', 'B'], index=['first', 'second'])
stacked
shape: (8, 4)
first | second | variable | value |
---|---|---|---|
str | str | str | f64 |
"bar" | "one" | "A" | -0.727965 |
"bar" | "two" | "A" | 0.339969 |
"baz" | "one" | "A" | -0.339355 |
"baz" | "two" | "A" | 0.884345 |
"bar" | "one" | "B" | -0.589346 |
"bar" | "two" | "B" | -0.693205 |
"baz" | "one" | "B" | 0.593616 |
"baz" | "two" | "B" | 1.591431 |
stacked.pivot('variable', index=['first', 'second'])
shape: (4, 4)
first | second | A | B |
---|---|---|---|
str | str | f64 | f64 |
"bar" | "one" | -0.727965 | -0.589346 |
"bar" | "two" | 0.339969 | -0.693205 |
"baz" | "one" | -0.339355 | 0.593616 |
"baz" | "two" | 0.884345 | 1.591431 |
stacked.pivot('second', index=['first', 'variable'])
shape: (4, 4)
first | variable | one | two |
---|---|---|---|
str | str | f64 | f64 |
"bar" | "A" | -0.727965 | 0.339969 |
"baz" | "A" | -0.339355 | 0.884345 |
"bar" | "B" | -0.589346 | -0.693205 |
"baz" | "B" | 0.593616 | 1.591431 |
stacked.pivot('first', index=['second', 'variable'])
shape: (4, 4)
second | variable | bar | baz |
---|---|---|---|
str | str | f64 | f64 |
"one" | "A" | -0.727965 | -0.339355 |
"two" | "A" | 0.339969 | 0.884345 |
"one" | "B" | -0.589346 | 0.593616 |
"two" | "B" | -0.693205 | 1.591431 |
Pivot tables#
data = '''
index A B C D E
0 one A foo -1.202872 0.047609
1 one B foo -1.814470 -0.136473
2 two C foo 1.018601 -0.561757
3 three A bar -0.595447 -1.623033
4 one B bar 1.395433 0.029399
5 one C bar -0.392670 -0.542108
6 two A foo 0.007207 0.282696
7 three B foo 1.928123 -0.087302
8 one C foo -0.055224 -1.575170
9 one A bar 2.395985 1.771208
10 two B bar 1.552825 0.816482
11 three C bar 0.166599 1.100230
'''
df = to_dataframe(data)
df.pivot('C', index=['A', 'B'], values="D").sort('A', 'B')
shape: (9, 4)
A | B | foo | bar |
---|---|---|---|
str | str | f64 | f64 |
"one" | "A" | -1.202872 | 2.395985 |
"one" | "B" | -1.81447 | 1.395433 |
"one" | "C" | -0.055224 | -0.39267 |
"three" | "A" | null | -0.595447 |
"three" | "B" | 1.928123 | null |
"three" | "C" | null | 0.166599 |
"two" | "A" | 0.007207 | null |
"two" | "B" | null | 1.552825 |
"two" | "C" | 1.018601 | null |
Time series#
n = 1000
rng = pl.datetime(2012, 1, 1) + pl.duration(seconds=pl.int_range(0, n))
ts = pl.select(
index=rng,
value=np.random.randint(0, 500, n)
)
(
ts
.group_by(pl.col('index').dt.truncate('5m'), maintain_order=True)
.agg(pl.col('value').sum())
)
shape: (4, 2)
index | value |
---|---|
datetime[μs] | i32 |
2012-01-01 00:00:00 | 72130 |
2012-01-01 00:05:00 | 76640 |
2012-01-01 00:10:00 | 71218 |
2012-01-01 00:15:00 | 25874 |
rng = pl.datetime(2012, 3, 6) + pl.duration(days=pl.int_range(0, 5))
ts = pl.select(
index=rng,
value=np.random.randn(5)
)
ts
shape: (5, 2)
index | value |
---|---|
datetime[μs] | f64 |
2012-03-06 00:00:00 | -0.037589 |
2012-03-07 00:00:00 | -0.422765 |
2012-03-08 00:00:00 | 0.148907 |
2012-03-09 00:00:00 | -0.02686 |
2012-03-10 00:00:00 | 0.813454 |
ts_utc = ts.with_columns(
pl.col('index').dt.replace_time_zone("UTC")
)
ts_utc.with_columns(
pl.col('index').dt.convert_time_zone('US/Eastern')
)
shape: (5, 2)
index | value |
---|---|
datetime[μs, US/Eastern] | f64 |
2012-03-05 19:00:00 EST | -0.037589 |
2012-03-06 19:00:00 EST | -0.422765 |
2012-03-07 19:00:00 EST | 0.148907 |
2012-03-08 19:00:00 EST | -0.02686 |
2012-03-09 19:00:00 EST | 0.813454 |
ts.with_columns(
pl.col('index').dt.add_business_days(5, roll="backward")
)
shape: (5, 2)
index | value |
---|---|
datetime[μs] | f64 |
2012-03-13 00:00:00 | -0.037589 |
2012-03-14 00:00:00 | -0.422765 |
2012-03-15 00:00:00 | 0.148907 |
2012-03-16 00:00:00 | -0.02686 |
2012-03-16 00:00:00 | 0.813454 |
Categoricals#
df = pl.DataFrame({
"id": [1, 2, 3, 4, 5, 6],
"raw_grade": ["a", "b", "b", "a", "a", "e"]
})
df = df.with_columns(
grade=pl.col('raw_grade').cast(pl.Categorical)
)
df
shape: (6, 3)
id | raw_grade | grade |
---|---|---|
i64 | str | cat |
1 | "a" | "a" |
2 | "b" | "b" |
3 | "b" | "b" |
4 | "a" | "a" |
5 | "a" | "a" |
6 | "e" | "e" |
df = df.with_columns(
pl.col('grade')
.cast(pl.String)
.replace(dict(a='very good', b='good', e="very bad"))
.cast(pl.Categorical)
)
grades = pl.Enum(["very bad", "bad", "medium", "good", "very good"])
df = df.with_columns(
pl.col('grade').cast(grades)
)
df.sort('grade')
shape: (6, 3)
id | raw_grade | grade |
---|---|---|
i64 | str | enum |
6 | "e" | "very bad" |
2 | "b" | "good" |
3 | "b" | "good" |
1 | "a" | "very good" |
4 | "a" | "very good" |
5 | "a" | "very good" |
# df.groupby("grade", observed=False).size()
(
df
.group_by('grade')
.agg(pl.len())
.join(pl.select(grade=grades.categories.cast(grades)), on='grade', how='right')
.with_columns(pl.col('len').fill_null(0))
)
shape: (5, 2)
len | grade |
---|---|
u32 | enum |
1 | "very bad" |
0 | "bad" |
0 | "medium" |
2 | "good" |
3 | "very good" |
Plotting#
import hvplot.polars
ts = pl.select(
index=pl.datetime(2000, 1, 1) + pl.duration(days=pl.int_range(1000)),
value=np.random.randn(1000)
)
ts = ts.with_columns(pl.col('value').cum_sum())
ts.hvplot.line(x='index')
df = pl.DataFrame(
np.random.randn(1000, 4), schema=['A', 'B', 'C', 'D']
)
df = df.with_columns(
ts['index'],
pl.all().cum_sum()
)
df.hvplot.line(x='index')
Importing and exporting data#
CSV#
df = pl.DataFrame(np.random.randint(0, 5, (10, 5)))
df.write_csv('data/pandas_foo.csv')
pl.read_csv('data/pandas_foo.csv')
shape: (10, 5)
column_0 | column_1 | column_2 | column_3 | column_4 |
---|---|---|---|---|
i64 | i64 | i64 | i64 | i64 |
1 | 2 | 0 | 0 | 1 |
3 | 1 | 0 | 4 | 1 |
3 | 1 | 0 | 1 | 2 |
2 | 1 | 1 | 0 | 0 |
0 | 3 | 0 | 4 | 4 |
4 | 4 | 4 | 3 | 0 |
2 | 0 | 2 | 2 | 4 |
0 | 2 | 1 | 3 | 3 |
4 | 0 | 3 | 2 | 4 |
2 | 1 | 4 | 1 | 1 |
Parquet#
df.write_parquet('data/pandas_foo.parquet')
pl.read_parquet('data/pandas_foo.parquet')
shape: (10, 5)
column_0 | column_1 | column_2 | column_3 | column_4 |
---|---|---|---|---|
i32 | i32 | i32 | i32 | i32 |
1 | 2 | 0 | 0 | 1 |
3 | 1 | 0 | 4 | 1 |
3 | 1 | 0 | 1 | 2 |
2 | 1 | 1 | 0 | 0 |
0 | 3 | 0 | 4 | 4 |
4 | 4 | 4 | 3 | 0 |
2 | 0 | 2 | 2 | 4 |
0 | 2 | 1 | 3 | 3 |
4 | 0 | 3 | 2 | 4 |
2 | 1 | 4 | 1 | 1 |
Excel#
df.write_excel('data/pandas_foo.xlsx', 'Sheet1')
<xlsxwriter.workbook.Workbook at 0x17dbc86c950>
pl.read_excel('data/pandas_foo.xlsx', sheet_name='Sheet1')
shape: (10, 5)
column_0 | column_1 | column_2 | column_3 | column_4 |
---|---|---|---|---|
i64 | i64 | i64 | i64 | i64 |
1 | 2 | 0 | 0 | 1 |
3 | 1 | 0 | 4 | 1 |
3 | 1 | 0 | 1 | 2 |
2 | 1 | 1 | 0 | 0 |
0 | 3 | 0 | 4 | 4 |
4 | 4 | 4 | 3 | 0 |
2 | 0 | 2 | 2 | 4 |
0 | 2 | 1 | 3 | 3 |
4 | 0 | 3 | 2 | 4 |
2 | 1 | 4 | 1 | 1 |
Gotchas#
if pl.Series([False, True, False]):
print("I was true")
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[199], line 1
----> 1 if pl.Series([False, True, False]):
2 print("I was true")
File C:\mamba\envs\py312\Lib\site-packages\polars\series\series.py:636, in Series.__bool__(self)
626 def __bool__(self) -> NoReturn:
627 msg = (
628 "the truth value of a Series is ambiguous"
629 "\n\n"
(...)
634 "- instead of `s in [y, z]`, use `s.is_in([y, z])`\n"
635 )
--> 636 raise TypeError(msg)
TypeError: the truth value of a Series is ambiguous
Here are some things you might want to try:
- instead of `if s`, use `if not s.is_empty()`
- instead of `s1 and s2`, use `s1 & s2`
- instead of `s1 or s2`, use `s1 | s2`
- instead of `s in [y, z]`, use `s.is_in([y, z])`