10 minutes to pandas

10 minutes to pandas#

import numpy as np
import polars as pl
from helper.jupyter import row

def to_dataframe(data):
    import re
    from io import StringIO
    data = re.sub(r'[ \t]+', ',', data.strip())
    return pl.read_csv(StringIO(data))

Object creation#

s = pl.Series([1, 3, 5, np.nan, 6, 8], strict=False)
s2 = pl.Series([1, 3, 5, None, 6, 8])
row(s, s2)

shape: (6,)


f64
1.0
3.0
5.0
NaN
6.0
8.0

shape: (6,)


i64
1
3
5
null
6
8

dates = pl.date_range(
    pl.date(2013, 1, 1), 
    pl.date(2013, 1, 6), 
    interval='1d', eager=True).alias('index')
dates

shape: (6,)

index
date
2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06

df = pl.DataFrame(np.random.randn(6, 4), schema=list("ABCD"))
df.insert_column(0, dates)
df

shape: (6, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-01	-1.281556	-0.976005	-0.44965	1.508403
2013-01-02	2.299578	0.450922	-0.758744	-0.762308
2013-01-03	0.489373	-0.124009	0.597175	-0.365254
2013-01-04	-0.689202	0.040563	0.883405	0.400285
2013-01-05	0.972164	1.596552	1.191523	-1.907236
2013-01-06	0.830918	0.211487	0.91045	1.73558

arr = np.array([[ 0.4691, -0.2829, -1.5091, -1.1356],
       [ 1.2121, -0.1732,  0.1192, -1.0442],
       [-0.8618, -2.1046, -0.4949,  1.0718],
       [ 0.7216, -0.7068, -1.0396,  0.2719],
       [-0.425 ,  0.567 ,  0.2762, -1.0874],
       [-0.6737,  0.1136, -1.4784,  0.525 ]])
df = pl.DataFrame(arr, schema=list("ABCD"))
df.insert_column(0, dates)

shape: (6, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-01	0.4691	-0.2829	-1.5091	-1.1356
2013-01-02	1.2121	-0.1732	0.1192	-1.0442
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718
2013-01-04	0.7216	-0.7068	-1.0396	0.2719
2013-01-05	-0.425	0.567	0.2762	-1.0874
2013-01-06	-0.6737	0.1136	-1.4784	0.525

df2 = pl.select(
    A=1.0,
    B=pl.date(2013, 1, 2),
    D=np.array([3] * 4, dtype="int32"),
    E=pl.lit(pl.Series(['test', 'train', 'test', 'train'], dtype=pl.Categorical)),
    F=pl.lit("foo"),
)
df2

shape: (4, 5)

A	B	D	E	F
f64	date	i32	cat	str
1.0	2013-01-02	3	"test"	"foo"
1.0	2013-01-02	3	"train"	"foo"
1.0	2013-01-02	3	"test"	"foo"
1.0	2013-01-02	3	"train"	"foo"

df2.dtypes

[Float64, Date, Int32, Categorical(ordering='physical'), String]

df2.schema

Schema([('A', Float64),
        ('B', Date),
        ('D', Int32),
        ('E', Categorical(ordering='physical')),
        ('F', String)])

Viewing data#

df.head(5)

shape: (5, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-01	0.4691	-0.2829	-1.5091	-1.1356
2013-01-02	1.2121	-0.1732	0.1192	-1.0442
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718
2013-01-04	0.7216	-0.7068	-1.0396	0.2719
2013-01-05	-0.425	0.567	0.2762	-1.0874

df.tail(3)

shape: (3, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-04	0.7216	-0.7068	-1.0396	0.2719
2013-01-05	-0.425	0.567	0.2762	-1.0874
2013-01-06	-0.6737	0.1136	-1.4784	0.525

df.get_column('index')

shape: (6,)

index
date
2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06

df.columns

['index', 'A', 'B', 'C', 'D']

df.drop('index').to_numpy()

array([[ 0.4691, -0.2829, -1.5091, -1.1356],
       [ 1.2121, -0.1732,  0.1192, -1.0442],
       [-0.8618, -2.1046, -0.4949,  1.0718],
       [ 0.7216, -0.7068, -1.0396,  0.2719],
       [-0.425 ,  0.567 ,  0.2762, -1.0874],
       [-0.6737,  0.1136, -1.4784,  0.525 ]])

df2.dtypes

[Float64, Date, Int32, Categorical(ordering='physical'), String]

df2.schema

Schema([('A', Float64),
        ('B', Date),
        ('D', Int32),
        ('E', Categorical(ordering='physical')),
        ('F', String)])

df2.to_numpy()

array([[1.0, datetime.date(2013, 1, 2), 3, 'test', 'foo'],
       [1.0, datetime.date(2013, 1, 2), 3, 'train', 'foo'],
       [1.0, datetime.date(2013, 1, 2), 3, 'test', 'foo'],
       [1.0, datetime.date(2013, 1, 2), 3, 'train', 'foo']], dtype=object)

df2.to_numpy(structured=True)

array([(1., '2013-01-02', 3, 'test', 'foo'),
       (1., '2013-01-02', 3, 'train', 'foo'),
       (1., '2013-01-02', 3, 'test', 'foo'),
       (1., '2013-01-02', 3, 'train', 'foo')],
      dtype=[('A', '<f8'), ('B', '<M8[D]'), ('D', '<i4'), ('E', 'O'), ('F', '<U3')])

df.describe()

shape: (9, 6)

statistic	index	A	B	C	D
str	str	f64	f64	f64	f64
"count"	"6"	6.0	6.0	6.0	6.0
"null_count"	"0"	0.0	0.0	0.0	0.0
"mean"	"2013-01-03 12:00:00"	0.073717	-0.43115	-0.687767	-0.233083
"std"	null	0.843153	0.922821	0.779885	0.973111
"min"	"2013-01-01"	-0.8618	-2.1046	-1.5091	-1.1356
"25%"	"2013-01-02"	-0.6737	-0.7068	-1.4784	-1.0874
"50%"	"2013-01-04"	0.4691	-0.1732	-0.4949	0.2719
"75%"	"2013-01-05"	0.7216	0.1136	0.1192	0.525
"max"	"2013-01-06"	1.2121	0.567	0.2762	1.0718

df.drop('index').transpose(
    include_header=True, 
    column_names=df.get_column('index').cast(pl.String))

shape: (4, 7)

column	2013-01-01	2013-01-02	2013-01-03	2013-01-04	2013-01-05	2013-01-06
str	f64	f64	f64	f64	f64	f64
"A"	0.4691	1.2121	-0.8618	0.7216	-0.425	-0.6737
"B"	-0.2829	-0.1732	-2.1046	-0.7068	0.567	0.1136
"C"	-1.5091	0.1192	-0.4949	-1.0396	0.2762	-1.4784
"D"	-1.1356	-1.0442	1.0718	0.2719	-1.0874	0.525

df.select('index', *sorted(df.columns[1:], reverse=True))

shape: (6, 5)

index	D	C	B	A
date	f64	f64	f64	f64
2013-01-01	-1.1356	-1.5091	-0.2829	0.4691
2013-01-02	-1.0442	0.1192	-0.1732	1.2121
2013-01-03	1.0718	-0.4949	-2.1046	-0.8618
2013-01-04	0.2719	-1.0396	-0.7068	0.7216
2013-01-05	-1.0874	0.2762	0.567	-0.425
2013-01-06	0.525	-1.4784	0.1136	-0.6737

df.sort('B')

shape: (6, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718
2013-01-04	0.7216	-0.7068	-1.0396	0.2719
2013-01-01	0.4691	-0.2829	-1.5091	-1.1356
2013-01-02	1.2121	-0.1732	0.1192	-1.0442
2013-01-06	-0.6737	0.1136	-1.4784	0.525
2013-01-05	-0.425	0.567	0.2762	-1.0874

Selection#

Getitem ([])#

df['A']

shape: (6,)

A
f64
0.4691
1.2121
-0.8618
0.7216
-0.425
-0.6737

df.slice(0, 3)

shape: (3, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-01	0.4691	-0.2829	-1.5091	-1.1356
2013-01-02	1.2121	-0.1732	0.1192	-1.0442
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718

df.filter(
    pl.col('index').is_between(
        pl.date(2013, 1, 2), 
        pl.date(2013, 1, 4)
    )
)

shape: (3, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-02	1.2121	-0.1732	0.1192	-1.0442
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718
2013-01-04	0.7216	-0.7068	-1.0396	0.2719

Selection by label#

df.filter(pl.col('index') == pl.col('index').first())

shape: (1, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-01	0.4691	-0.2829	-1.5091	-1.1356

df.select('index', 'A', 'B')

shape: (6, 3)

index	A	B
date	f64	f64
2013-01-01	0.4691	-0.2829
2013-01-02	1.2121	-0.1732
2013-01-03	-0.8618	-2.1046
2013-01-04	0.7216	-0.7068
2013-01-05	-0.425	0.567
2013-01-06	-0.6737	0.1136

(df
.filter(
    pl.col('index').is_between(
        pl.date(2013, 1, 2), 
        pl.date(2013, 1, 4)
    )
)
.select('index', 'A', 'B')
)

shape: (3, 3)

index	A	B
date	f64	f64
2013-01-02	1.2121	-0.1732
2013-01-03	-0.8618	-2.1046
2013-01-04	0.7216	-0.7068

(
df
.filter(pl.col('index') == pl.col('index').first())
.select('A')
.item()
)

0.4691

df.row(0, named=True)['A']

0.4691

Selection by position#

df.slice(3, 1)

shape: (1, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-04	0.7216	-0.7068	-1.0396	0.2719

df.select(pl.nth(range(3)).slice(3, 2))

shape: (2, 3)

index	A	B
date	f64	f64
2013-01-04	0.7216	-0.7068
2013-01-05	-0.425	0.567

df.select(pl.nth([0, 1, 3]).gather([1, 2, 4]))

shape: (3, 3)

index	A	C
date	f64	f64
2013-01-02	1.2121	0.1192
2013-01-03	-0.8618	-0.4949
2013-01-05	-0.425	0.2762

df.slice(1, 2)

shape: (2, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-02	1.2121	-0.1732	0.1192	-1.0442
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718

df.select(pl.nth(0), pl.nth(range(2, 4)))

shape: (6, 3)

index	B	C
date	f64	f64
2013-01-01	-0.2829	-1.5091
2013-01-02	-0.1732	0.1192
2013-01-03	-2.1046	-0.4949
2013-01-04	-0.7068	-1.0396
2013-01-05	0.567	0.2762
2013-01-06	0.1136	-1.4784

df.row(1)[2]

-0.1732

Boolean indexing#

df.filter(pl.col('A') > 0)

shape: (3, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-01	0.4691	-0.2829	-1.5091	-1.1356
2013-01-02	1.2121	-0.1732	0.1192	-1.0442
2013-01-04	0.7216	-0.7068	-1.0396	0.2719

df.select(
    pl.col('index'),
    pl.when(pl.exclude('index') > 0)
      .then(pl.exclude('index'))
      .otherwise(None)
)

shape: (6, 5)

index	A	B	C	D
date	f64	f64	f64	f64
2013-01-01	0.4691	null	null	null
2013-01-02	1.2121	null	0.1192	null
2013-01-03	null	null	null	1.0718
2013-01-04	0.7216	null	null	0.2719
2013-01-05	null	0.567	0.2762	null
2013-01-06	null	0.1136	null	0.525

df2 = df.with_columns(
    E=pl.Series(['one', 'one', 'two', 'three', 'four', 'three'])
)
df2

shape: (6, 6)

index	A	B	C	D	E
date	f64	f64	f64	f64	str
2013-01-01	0.4691	-0.2829	-1.5091	-1.1356	"one"
2013-01-02	1.2121	-0.1732	0.1192	-1.0442	"one"
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718	"two"
2013-01-04	0.7216	-0.7068	-1.0396	0.2719	"three"
2013-01-05	-0.425	0.567	0.2762	-1.0874	"four"
2013-01-06	-0.6737	0.1136	-1.4784	0.525	"three"

df2.filter(pl.col('E').is_in(['two', 'four']))

shape: (2, 6)

index	A	B	C	D	E
date	f64	f64	f64	f64	str
2013-01-03	-0.8618	-2.1046	-0.4949	1.0718	"two"
2013-01-05	-0.425	0.567	0.2762	-1.0874	"four"

Setting#

# s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
s1 = pl.select(
    index=pl.date_range(pl.date(2013, 1, 2), pl.date(2013, 1, 7)),
    value=pl.Series([1, 2, 3, 4, 5, 6])
)
s1

shape: (6, 2)

index	value
date	i64
2013-01-02	1
2013-01-03	2
2013-01-04	3
2013-01-05	4
2013-01-06	5
2013-01-07	6

# df["F"] = s1
df = df.join(s1.select('index', F='value'), on='index', how='left')

# df.at[dates[0], "A"] = 0
df = df.with_columns(
    pl.when(pl.col('index') == dates[0])
      .then(0)
      .otherwise('A')
      .alias('A')
)

# df.iat[0, 1] = 0
df = df.with_columns(
    pl.when(pl.int_range(pl.len()) == 0)
      .then(0)
      .otherwise(pl.nth(2))
      .name.keep()
)

# df.loc[:, "D"] = np.array([5] * len(df))
df = df.with_columns(D=5)
df

shape: (6, 6)

index	A	B	C	D	F
date	f64	f64	f64	i32	i64
2013-01-01	0.0	0.0	-1.5091	5	null
2013-01-02	1.2121	-0.1732	0.1192	5	1
2013-01-03	-0.8618	-2.1046	-0.4949	5	2
2013-01-04	0.7216	-0.7068	-1.0396	5	3
2013-01-05	-0.425	0.567	0.2762	5	4
2013-01-06	-0.6737	0.1136	-1.4784	5	5

# df2[df2 > 0] = -df2
df2 = df.select(pl.all())

others = pl.exclude('index')
df2 = df2.select(
    'index',
    pl.when(others > 0)
      .then(-others)
      .otherwise(others)
)
df2

shape: (6, 6)

index	A	B	C	D	F
date	f64	f64	f64	i32	i64
2013-01-01	0.0	0.0	-1.5091	-5	null
2013-01-02	-1.2121	-0.1732	-0.1192	-5	-1
2013-01-03	-0.8618	-2.1046	-0.4949	-5	-2
2013-01-04	-0.7216	-0.7068	-1.0396	-5	-3
2013-01-05	-0.425	-0.567	-0.2762	-5	-4
2013-01-06	-0.6737	-0.1136	-1.4784	-5	-5

Missing data#

df1 = df.select(
    pl.all().slice(0, 4),
    E=pl.Series([1.0, 1.0, None, None])
)
df1

shape: (4, 7)

index	A	B	C	D	F	E
date	f64	f64	f64	i32	i64	f64
2013-01-01	0.0	0.0	-1.5091	5	null	1.0
2013-01-02	1.2121	-0.1732	0.1192	5	1	1.0
2013-01-03	-0.8618	-2.1046	-0.4949	5	2	null
2013-01-04	0.7216	-0.7068	-1.0396	5	3	null

df1.drop_nulls()

shape: (1, 7)

index	A	B	C	D	F	E
date	f64	f64	f64	i32	i64	f64
2013-01-02	1.2121	-0.1732	0.1192	5	1	1.0

df1.fill_null(5)

shape: (4, 7)

index	A	B	C	D	F	E
date	f64	f64	f64	i32	i64	f64
2013-01-01	0.0	0.0	-1.5091	5	5	1.0
2013-01-02	1.2121	-0.1732	0.1192	5	1	1.0
2013-01-03	-0.8618	-2.1046	-0.4949	5	2	5.0
2013-01-04	0.7216	-0.7068	-1.0396	5	3	5.0

df1.select(
    'index',
    pl.exclude('index').is_null()
)

shape: (4, 7)

index	A	B	C	D	F	E
date	bool	bool	bool	bool	bool	bool
2013-01-01	false	false	false	false	true	false
2013-01-02	false	false	false	false	false	false
2013-01-03	false	false	false	false	false	true
2013-01-04	false	false	false	false	false	true

Operations#

Stats#

df.select(pl.exclude('index').mean())

shape: (1, 5)

A	B	C	D	F
f64	f64	f64	f64	f64
-0.004467	-0.384	-0.687767	5.0	3.0

df.select(
    'index',
    pl.mean_horizontal(pl.exclude('index'))
)

shape: (6, 2)

index	A
date	f64
2013-01-01	0.872725
2013-01-02	1.43162
2013-01-03	0.70774
2013-01-04	1.39504
2013-01-05	1.88364
2013-01-06	1.5923

s = pl.select(
    index=df.get_column('index'),
    value=pl.Series([1, 3, 5, None, 6, 8]).shift(2)
)
s

shape: (6, 2)

index	value
date	i64
2013-01-01	null
2013-01-02	null
2013-01-03	1
2013-01-04	3
2013-01-05	5
2013-01-06	null

df_l, df_r = pl.align_frames(df, s, on='index')
df_l.select(
    'index',
    pl.exclude('index') - df_r.get_column('value')
)

shape: (6, 6)

index	A	B	C	D	F
date	f64	f64	f64	i64	i64
2013-01-01	null	null	null	null	null
2013-01-02	null	null	null	null	null
2013-01-03	-1.8618	-3.1046	-1.4949	4	1
2013-01-04	-2.2784	-3.7068	-4.0396	2	0
2013-01-05	-5.425	-4.433	-4.7238	0	-1
2013-01-06	null	null	null	null	null

User defined functions#

# df.agg(lambda x: np.mean(x) * 5.6)
df.select(
    pl.exclude('index').mean() * 5.6
)

shape: (1, 5)

A	B	C	D	F
f64	f64	f64	f64	f64
-0.025013	-2.1504	-3.851493	28.0	16.8

# df.transform(lambda x: x * 101.2)
df.select(
    pl.exclude('index') * 101.2
)

shape: (6, 5)

A	B	C	D	F
f64	f64	f64	f64	f64
0.0	0.0	-152.72092	506.0	null
122.66452	-17.52784	12.06304	506.0	101.2
-87.21416	-212.98552	-50.08388	506.0	202.4
73.02592	-71.52816	-105.20752	506.0	303.6
-43.01	57.3804	27.95144	506.0	404.8
-68.17844	11.49632	-149.61408	506.0	506.0

Value Counts#

s = pl.Series(np.random.randint(0, 7, size=10))
s.value_counts()

shape: (5, 2)

	count
i32	u32
6	2
4	1
0	1
5	3
2	3

String Methods#

s = pl.Series(["A", "B", "C", "Aaba", "Baca", None, "CABA", "dog", "cat"])

s.str.to_lowercase()

shape: (9,)


str
"a"
"b"
"c"
"aaba"
"baca"
null
"caba"
"dog"
"cat"

Merge#

Concat#

data = """
index     0         1         2         3
0 -0.548702  1.467327 -1.015962 -0.483075
1  1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952  0.991460 -0.919069  0.266046
3 -0.709661  1.669052  1.037882 -1.705775
4 -0.919854 -0.042379  1.247642 -0.009920
5  0.290213  0.495767  0.362949  1.548106
6 -1.131345 -0.089329  0.337863 -0.945867
7 -0.932132  1.956030  0.017587 -0.016692
8 -0.575247  0.254161 -1.143704  0.215897
9  1.193555 -0.077118 -0.408530 -0.862495
"""
df = to_dataframe(data)
df

shape: (10, 5)

index	0	1	2	3
i64	f64	f64	f64	f64
0	-0.548702	1.467327	-1.015962	-0.483075
1	1.63755	-1.217659	-0.291519	-1.745505
2	-0.263952	0.99146	-0.919069	0.266046
3	-0.709661	1.669052	1.037882	-1.705775
4	-0.919854	-0.042379	1.247642	-0.00992
5	0.290213	0.495767	0.362949	1.548106
6	-1.131345	-0.089329	0.337863	-0.945867
7	-0.932132	1.95603	0.017587	-0.016692
8	-0.575247	0.254161	-1.143704	0.215897
9	1.193555	-0.077118	-0.40853	-0.862495

pl.concat([df.slice(0, 3), df.slice(3, 4), df.slice(7)])

shape: (10, 5)

index	0	1	2	3
i64	f64	f64	f64	f64
0	-0.548702	1.467327	-1.015962	-0.483075
1	1.63755	-1.217659	-0.291519	-1.745505
2	-0.263952	0.99146	-0.919069	0.266046
3	-0.709661	1.669052	1.037882	-1.705775
4	-0.919854	-0.042379	1.247642	-0.00992
5	0.290213	0.495767	0.362949	1.548106
6	-1.131345	-0.089329	0.337863	-0.945867
7	-0.932132	1.95603	0.017587	-0.016692
8	-0.575247	0.254161	-1.143704	0.215897
9	1.193555	-0.077118	-0.40853	-0.862495

join#

left = pl.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pl.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
row(left, right)

shape: (2, 2)

key	lval
str	i64
"foo"	1
"foo"	2

shape: (2, 2)

key	rval
str	i64
"foo"	4
"foo"	5

left.join(right, on='key')

shape: (4, 3)

key	lval	rval
str	i64	i64
"foo"	1	4
"foo"	2	4
"foo"	1	5
"foo"	2	5

left = pl.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pl.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
row(left, right)

shape: (2, 2)

key	lval
str	i64
"foo"	1
"bar"	2

shape: (2, 2)

key	rval
str	i64
"foo"	4
"bar"	5

left.join(right, on='key')

shape: (2, 3)

key	lval	rval
str	i64	i64
"foo"	1	4
"bar"	2	5

Grouping#

data = """
A      B         C         D
foo    one  1.346061 -1.577585
bar    one  1.511763  0.396823
foo    two  1.627081 -0.105381
bar  three -0.990582 -0.532532
foo    two -0.441652  1.453749
bar    two  1.211526  1.208843
foo    one  0.268520 -0.080952
foo  three  0.024580 -0.264610
"""
df = to_dataframe(data)

# df.groupby("A")[["C", "D"]].sum()
df.group_by('A').agg(
    pl.col('C', 'D').sum()
)

shape: (2, 3)

A	C	D
str	f64	f64
"foo"	2.82459	-0.574779
"bar"	1.732707	1.073134

# df.groupby(["A", "B"]).sum()
(
df
.group_by('A', 'B')
.agg(pl.all().sum())
.sort('A', 'B')
)

shape: (6, 4)

A	B	C	D
str	str	f64	f64
"bar"	"one"	1.511763	0.396823
"bar"	"three"	-0.990582	-0.532532
"bar"	"two"	1.211526	1.208843
"foo"	"one"	1.614581	-1.658537
"foo"	"three"	0.02458	-0.26461
"foo"	"two"	1.185429	1.348368

Reshaping#

Stack#

data = '''
first second         A         B
bar   one    -0.727965 -0.589346
bar   two     0.339969 -0.693205
baz   one    -0.339355  0.593616
baz   two     0.884345  1.591431
'''
df2 = to_dataframe(data)
df2

shape: (4, 4)

first	second	A	B
str	str	f64	f64
"bar"	"one"	-0.727965	-0.589346
"bar"	"two"	0.339969	-0.693205
"baz"	"one"	-0.339355	0.593616
"baz"	"two"	0.884345	1.591431

stacked = df2.unpivot(['A', 'B'], index=['first', 'second'])
stacked

shape: (8, 4)

first	second	variable	value
str	str	str	f64
"bar"	"one"	"A"	-0.727965
"bar"	"two"	"A"	0.339969
"baz"	"one"	"A"	-0.339355
"baz"	"two"	"A"	0.884345
"bar"	"one"	"B"	-0.589346
"bar"	"two"	"B"	-0.693205
"baz"	"one"	"B"	0.593616
"baz"	"two"	"B"	1.591431

stacked.pivot('variable', index=['first', 'second'])

shape: (4, 4)

first	second	A	B
str	str	f64	f64
"bar"	"one"	-0.727965	-0.589346
"bar"	"two"	0.339969	-0.693205
"baz"	"one"	-0.339355	0.593616
"baz"	"two"	0.884345	1.591431

stacked.pivot('second', index=['first', 'variable'])

shape: (4, 4)

first	variable	one	two
str	str	f64	f64
"bar"	"A"	-0.727965	0.339969
"baz"	"A"	-0.339355	0.884345
"bar"	"B"	-0.589346	-0.693205
"baz"	"B"	0.593616	1.591431

stacked.pivot('first', index=['second', 'variable'])

shape: (4, 4)

second	variable	bar	baz
str	str	f64	f64
"one"	"A"	-0.727965	-0.339355
"two"	"A"	0.339969	0.884345
"one"	"B"	-0.589346	0.593616
"two"	"B"	-0.693205	1.591431

Pivot tables#

data = '''
index A  B    C         D         E
   one  A  foo -1.202872  0.047609
   one  B  foo -1.814470 -0.136473
   two  C  foo  1.018601 -0.561757
 three  A  bar -0.595447 -1.623033
   one  B  bar  1.395433  0.029399
   one  C  bar -0.392670 -0.542108
   two  A  foo  0.007207  0.282696
 three  B  foo  1.928123 -0.087302
   one  C  foo -0.055224 -1.575170
   one  A  bar  2.395985  1.771208
  two  B  bar  1.552825  0.816482
three  C  bar  0.166599  1.100230
'''
df = to_dataframe(data)

df.pivot('C', index=['A', 'B'], values="D").sort('A', 'B')

shape: (9, 4)

A	B	foo	bar
str	str	f64	f64
"one"	"A"	-1.202872	2.395985
"one"	"B"	-1.81447	1.395433
"one"	"C"	-0.055224	-0.39267
"three"	"A"	null	-0.595447
"three"	"B"	1.928123	null
"three"	"C"	null	0.166599
"two"	"A"	0.007207	null
"two"	"B"	null	1.552825
"two"	"C"	1.018601	null

Time series#

n = 1000
rng = pl.datetime(2012, 1, 1) + pl.duration(seconds=pl.int_range(0, n))
ts = pl.select(
    index=rng,
    value=np.random.randint(0, 500, n)
)
(
ts
.group_by(pl.col('index').dt.truncate('5m'), maintain_order=True)
.agg(pl.col('value').sum())
)

shape: (4, 2)

index	value
datetime[μs]	i32
2012-01-01 00:00:00	72130
2012-01-01 00:05:00	76640
2012-01-01 00:10:00	71218
2012-01-01 00:15:00	25874

rng = pl.datetime(2012, 3, 6) + pl.duration(days=pl.int_range(0, 5))
ts = pl.select(
    index=rng,
    value=np.random.randn(5)
)
ts

shape: (5, 2)

index	value
datetime[μs]	f64
2012-03-06 00:00:00	-0.037589
2012-03-07 00:00:00	-0.422765
2012-03-08 00:00:00	0.148907
2012-03-09 00:00:00	-0.02686
2012-03-10 00:00:00	0.813454

ts_utc = ts.with_columns(
    pl.col('index').dt.replace_time_zone("UTC")
)

ts_utc.with_columns(
    pl.col('index').dt.convert_time_zone('US/Eastern')
)

shape: (5, 2)

index	value
datetime[μs, US/Eastern]	f64
2012-03-05 19:00:00 EST	-0.037589
2012-03-06 19:00:00 EST	-0.422765
2012-03-07 19:00:00 EST	0.148907
2012-03-08 19:00:00 EST	-0.02686
2012-03-09 19:00:00 EST	0.813454

ts.with_columns(
    pl.col('index').dt.add_business_days(5, roll="backward")
)

shape: (5, 2)

index	value
datetime[μs]	f64
2012-03-13 00:00:00	-0.037589
2012-03-14 00:00:00	-0.422765
2012-03-15 00:00:00	0.148907
2012-03-16 00:00:00	-0.02686
2012-03-16 00:00:00	0.813454

Categoricals#

df = pl.DataFrame({
    "id": [1, 2, 3, 4, 5, 6], 
    "raw_grade": ["a", "b", "b", "a", "a", "e"]
})

df = df.with_columns(
    grade=pl.col('raw_grade').cast(pl.Categorical)
)
df

shape: (6, 3)

id	raw_grade	grade
i64	str	cat
1	"a"	"a"
2	"b"	"b"
3	"b"	"b"
4	"a"	"a"
5	"a"	"a"
6	"e"	"e"

df = df.with_columns(
    pl.col('grade')
      .cast(pl.String)
      .replace(dict(a='very good', b='good', e="very bad"))
      .cast(pl.Categorical)
)

grades = pl.Enum(["very bad", "bad", "medium", "good", "very good"])
df = df.with_columns(
    pl.col('grade').cast(grades)
)

df.sort('grade')

shape: (6, 3)

id	raw_grade	grade
i64	str	enum
6	"e"	"very bad"
2	"b"	"good"
3	"b"	"good"
1	"a"	"very good"
4	"a"	"very good"
5	"a"	"very good"

# df.groupby("grade", observed=False).size()
(
df
.group_by('grade')
.agg(pl.len())
.join(pl.select(grade=grades.categories.cast(grades)), on='grade', how='right')
.with_columns(pl.col('len').fill_null(0))
)

shape: (5, 2)

len	grade
u32	enum
1	"very bad"
0	"bad"
0	"medium"
2	"good"
3	"very good"

Plotting#

import hvplot.polars

ts = pl.select(
    index=pl.datetime(2000, 1, 1) + pl.duration(days=pl.int_range(1000)),
    value=np.random.randn(1000)
)
ts = ts.with_columns(pl.col('value').cum_sum())

ts.hvplot.line(x='index')

df = pl.DataFrame(
    np.random.randn(1000, 4), schema=['A', 'B', 'C', 'D']
)
df = df.with_columns(
    ts['index'],
    pl.all().cum_sum()
)
df.hvplot.line(x='index')

Importing and exporting data#

CSV#

df = pl.DataFrame(np.random.randint(0, 5, (10, 5)))
df.write_csv('data/pandas_foo.csv')

pl.read_csv('data/pandas_foo.csv')

shape: (10, 5)

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

Parquet#

df.write_parquet('data/pandas_foo.parquet')

pl.read_parquet('data/pandas_foo.parquet')

shape: (10, 5)

column_0	column_1	column_2	column_3	column_4
i32	i32	i32	i32	i32
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

Excel#

df.write_excel('data/pandas_foo.xlsx', 'Sheet1')

<xlsxwriter.workbook.Workbook at 0x17dbc86c950>

pl.read_excel('data/pandas_foo.xlsx', sheet_name='Sheet1')

shape: (10, 5)

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

Gotchas#

if pl.Series([False, True, False]):
     print("I was true")

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[199], line 1
----> 1 if pl.Series([False, True, False]):
      2      print("I was true")

File C:\mamba\envs\py312\Lib\site-packages\polars\series\series.py:636, in Series.__bool__(self)
    626 def __bool__(self) -> NoReturn:
    627     msg = (
    628         "the truth value of a Series is ambiguous"
    629         "\n\n"
   (...)
    634         "- instead of `s in [y, z]`, use `s.is_in([y, z])`\n"
    635     )
--> 636     raise TypeError(msg)

TypeError: the truth value of a Series is ambiguous

Here are some things you might want to try:
- instead of `if s`, use `if not s.is_empty()`
- instead of `s1 and s2`, use `s1 & s2`
- instead of `s1 or s2`, use `s1 | s2`
- instead of `s in [y, z]`, use `s.is_in([y, z])`

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

column_0	column_1	column_2	column_3	column_4
i32	i32	i32	i32	i32
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

column_0	column_1	column_2	column_3	column_4
i32	i32	i32	i32	i32
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

10 minutes to pandas

Contents

10 minutes to pandas#

Object creation#

Viewing data#

Selection#

Getitem ([])#

Selection by label#

Selection by position#

Boolean indexing#

Setting#

Missing data#

Operations#

Stats#

User defined functions#

Value Counts#

String Methods#

Merge#

Concat#

join#

Grouping#

Reshaping#

Stack#

Pivot tables#

Time series#

Categoricals#

Plotting#

Importing and exporting data#

CSV#

Parquet#

Excel#

Gotchas#

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

column_0	column_1	column_2	column_3	column_4
i32	i32	i32	i32	i32
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1

column_0	column_1	column_2	column_3	column_4
i64	i64	i64	i64	i64
1	2	0	0	1
3	1	0	4	1
3	1	0	1	2
2	1	1	0	0
0	3	0	4	4
4	4	4	3	0
2	0	2	2	4
0	2	1	3	3
4	0	3	2	4
2	1	4	1	1