10 minutes to pandas — reimplemented with polars#

import numpy as np
import polars as pl
from helper.jupyter import row
def to_dataframe(data):
    """Parse a whitespace-aligned text table into a polars DataFrame.

    Each run of spaces/tabs is collapsed into a single comma, turning the
    fixed-width block (as printed in the pandas docs) into CSV text that
    ``pl.read_csv`` can parse directly.
    """
    import re
    from io import StringIO
    delimiter_run = re.compile(r'[ \t]+')
    csv_text = delimiter_run.sub(',', data.strip())
    return pl.read_csv(StringIO(csv_text))

Object creation#

s = pl.Series([1, 3, 5, np.nan, 6, 8], strict=False)
s2 = pl.Series([1, 3, 5, None, 6, 8])
row(s, s2)
shape: (6,)
f64
1.0
3.0
5.0
NaN
6.0
8.0
shape: (6,)
i64
1
3
5
null
6
8
dates = pl.date_range(
    pl.date(2013, 1, 1), 
    pl.date(2013, 1, 6), 
    interval='1d', eager=True).alias('index')
dates
shape: (6,)
index
date
2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06
df = pl.DataFrame(np.random.randn(6, 4), schema=list("ABCD"))
df.insert_column(0, dates)
df
shape: (6, 5)
indexABCD
datef64f64f64f64
2013-01-01-1.281556-0.976005-0.449651.508403
2013-01-022.2995780.450922-0.758744-0.762308
2013-01-030.489373-0.1240090.597175-0.365254
2013-01-04-0.6892020.0405630.8834050.400285
2013-01-050.9721641.5965521.191523-1.907236
2013-01-060.8309180.2114870.910451.73558
arr = np.array([[ 0.4691, -0.2829, -1.5091, -1.1356],
       [ 1.2121, -0.1732,  0.1192, -1.0442],
       [-0.8618, -2.1046, -0.4949,  1.0718],
       [ 0.7216, -0.7068, -1.0396,  0.2719],
       [-0.425 ,  0.567 ,  0.2762, -1.0874],
       [-0.6737,  0.1136, -1.4784,  0.525 ]])
df = pl.DataFrame(arr, schema=list("ABCD"))
df.insert_column(0, dates)
shape: (6, 5)
indexABCD
datef64f64f64f64
2013-01-010.4691-0.2829-1.5091-1.1356
2013-01-021.2121-0.17320.1192-1.0442
2013-01-03-0.8618-2.1046-0.49491.0718
2013-01-040.7216-0.7068-1.03960.2719
2013-01-05-0.4250.5670.2762-1.0874
2013-01-06-0.67370.1136-1.47840.525
df2 = pl.select(
    A=1.0,
    B=pl.date(2013, 1, 2),
    D=np.array([3] * 4, dtype="int32"),
    E=pl.lit(pl.Series(['test', 'train', 'test', 'train'], dtype=pl.Categorical)),
    F=pl.lit("foo"),
)
df2
shape: (4, 5)
ABDEF
f64datei32catstr
1.02013-01-023"test""foo"
1.02013-01-023"train""foo"
1.02013-01-023"test""foo"
1.02013-01-023"train""foo"
df2.dtypes
[Float64, Date, Int32, Categorical(ordering='physical'), String]
df2.schema
Schema([('A', Float64),
        ('B', Date),
        ('D', Int32),
        ('E', Categorical(ordering='physical')),
        ('F', String)])

Viewing data#

df.head(5)
shape: (5, 5)
indexABCD
datef64f64f64f64
2013-01-010.4691-0.2829-1.5091-1.1356
2013-01-021.2121-0.17320.1192-1.0442
2013-01-03-0.8618-2.1046-0.49491.0718
2013-01-040.7216-0.7068-1.03960.2719
2013-01-05-0.4250.5670.2762-1.0874
df.tail(3)
shape: (3, 5)
indexABCD
datef64f64f64f64
2013-01-040.7216-0.7068-1.03960.2719
2013-01-05-0.4250.5670.2762-1.0874
2013-01-06-0.67370.1136-1.47840.525
df.get_column('index')
shape: (6,)
index
date
2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06
df.columns
['index', 'A', 'B', 'C', 'D']
df.drop('index').to_numpy()
array([[ 0.4691, -0.2829, -1.5091, -1.1356],
       [ 1.2121, -0.1732,  0.1192, -1.0442],
       [-0.8618, -2.1046, -0.4949,  1.0718],
       [ 0.7216, -0.7068, -1.0396,  0.2719],
       [-0.425 ,  0.567 ,  0.2762, -1.0874],
       [-0.6737,  0.1136, -1.4784,  0.525 ]])
df2.dtypes
[Float64, Date, Int32, Categorical(ordering='physical'), String]
df2.schema
Schema([('A', Float64),
        ('B', Date),
        ('D', Int32),
        ('E', Categorical(ordering='physical')),
        ('F', String)])
df2.to_numpy()
array([[1.0, datetime.date(2013, 1, 2), 3, 'test', 'foo'],
       [1.0, datetime.date(2013, 1, 2), 3, 'train', 'foo'],
       [1.0, datetime.date(2013, 1, 2), 3, 'test', 'foo'],
       [1.0, datetime.date(2013, 1, 2), 3, 'train', 'foo']], dtype=object)
df2.to_numpy(structured=True)
array([(1., '2013-01-02', 3, 'test', 'foo'),
       (1., '2013-01-02', 3, 'train', 'foo'),
       (1., '2013-01-02', 3, 'test', 'foo'),
       (1., '2013-01-02', 3, 'train', 'foo')],
      dtype=[('A', '<f8'), ('B', '<M8[D]'), ('D', '<i4'), ('E', 'O'), ('F', '<U3')])
df.describe()
shape: (9, 6)
statisticindexABCD
strstrf64f64f64f64
"count""6"6.06.06.06.0
"null_count""0"0.00.00.00.0
"mean""2013-01-03 12:00:00"0.073717-0.43115-0.687767-0.233083
"std"null0.8431530.9228210.7798850.973111
"min""2013-01-01"-0.8618-2.1046-1.5091-1.1356
"25%""2013-01-02"-0.6737-0.7068-1.4784-1.0874
"50%""2013-01-04"0.4691-0.1732-0.49490.2719
"75%""2013-01-05"0.72160.11360.11920.525
"max""2013-01-06"1.21210.5670.27621.0718
df.drop('index').transpose(
    include_header=True, 
    column_names=df.get_column('index').cast(pl.String))
shape: (4, 7)
column2013-01-012013-01-022013-01-032013-01-042013-01-052013-01-06
strf64f64f64f64f64f64
"A"0.46911.2121-0.86180.7216-0.425-0.6737
"B"-0.2829-0.1732-2.1046-0.70680.5670.1136
"C"-1.50910.1192-0.4949-1.03960.2762-1.4784
"D"-1.1356-1.04421.07180.2719-1.08740.525
df.select('index', *sorted(df.columns[1:], reverse=True))
shape: (6, 5)
indexDCBA
datef64f64f64f64
2013-01-01-1.1356-1.5091-0.28290.4691
2013-01-02-1.04420.1192-0.17321.2121
2013-01-031.0718-0.4949-2.1046-0.8618
2013-01-040.2719-1.0396-0.70680.7216
2013-01-05-1.08740.27620.567-0.425
2013-01-060.525-1.47840.1136-0.6737
df.sort('B')
shape: (6, 5)
indexABCD
datef64f64f64f64
2013-01-03-0.8618-2.1046-0.49491.0718
2013-01-040.7216-0.7068-1.03960.2719
2013-01-010.4691-0.2829-1.5091-1.1356
2013-01-021.2121-0.17320.1192-1.0442
2013-01-06-0.67370.1136-1.47840.525
2013-01-05-0.4250.5670.2762-1.0874

Selection#

Getitem ([])#

df['A']
shape: (6,)
A
f64
0.4691
1.2121
-0.8618
0.7216
-0.425
-0.6737
df.slice(0, 3)
shape: (3, 5)
indexABCD
datef64f64f64f64
2013-01-010.4691-0.2829-1.5091-1.1356
2013-01-021.2121-0.17320.1192-1.0442
2013-01-03-0.8618-2.1046-0.49491.0718
df.filter(
    pl.col('index').is_between(
        pl.date(2013, 1, 2), 
        pl.date(2013, 1, 4)
    )
)
shape: (3, 5)
indexABCD
datef64f64f64f64
2013-01-021.2121-0.17320.1192-1.0442
2013-01-03-0.8618-2.1046-0.49491.0718
2013-01-040.7216-0.7068-1.03960.2719

Selection by label#

df.filter(pl.col('index') == pl.col('index').first())
shape: (1, 5)
indexABCD
datef64f64f64f64
2013-01-010.4691-0.2829-1.5091-1.1356
df.select('index', 'A', 'B')
shape: (6, 3)
indexAB
datef64f64
2013-01-010.4691-0.2829
2013-01-021.2121-0.1732
2013-01-03-0.8618-2.1046
2013-01-040.7216-0.7068
2013-01-05-0.4250.567
2013-01-06-0.67370.1136
(df
.filter(
    pl.col('index').is_between(
        pl.date(2013, 1, 2), 
        pl.date(2013, 1, 4)
    )
)
.select('index', 'A', 'B')
)
shape: (3, 3)
indexAB
datef64f64
2013-01-021.2121-0.1732
2013-01-03-0.8618-2.1046
2013-01-040.7216-0.7068
(
df
.filter(pl.col('index') == pl.col('index').first())
.select('A')
.item()
)
0.4691
df.row(0, named=True)['A']
0.4691

Selection by position#

df.slice(3, 1)
shape: (1, 5)
indexABCD
datef64f64f64f64
2013-01-040.7216-0.7068-1.03960.2719
df.select(pl.nth(range(3)).slice(3, 2))
shape: (2, 3)
indexAB
datef64f64
2013-01-040.7216-0.7068
2013-01-05-0.4250.567
df.select(pl.nth([0, 1, 3]).gather([1, 2, 4]))
shape: (3, 3)
indexAC
datef64f64
2013-01-021.21210.1192
2013-01-03-0.8618-0.4949
2013-01-05-0.4250.2762
df.slice(1, 2)
shape: (2, 5)
indexABCD
datef64f64f64f64
2013-01-021.2121-0.17320.1192-1.0442
2013-01-03-0.8618-2.1046-0.49491.0718
df.select(pl.nth(0), pl.nth(range(2, 4)))
shape: (6, 3)
indexBC
datef64f64
2013-01-01-0.2829-1.5091
2013-01-02-0.17320.1192
2013-01-03-2.1046-0.4949
2013-01-04-0.7068-1.0396
2013-01-050.5670.2762
2013-01-060.1136-1.4784
df.row(1)[2]
-0.1732

Boolean indexing#

df.filter(pl.col('A') > 0)
shape: (3, 5)
indexABCD
datef64f64f64f64
2013-01-010.4691-0.2829-1.5091-1.1356
2013-01-021.2121-0.17320.1192-1.0442
2013-01-040.7216-0.7068-1.03960.2719
df.select(
    pl.col('index'),
    pl.when(pl.exclude('index') > 0)
      .then(pl.exclude('index'))
      .otherwise(None)
)
shape: (6, 5)
indexABCD
datef64f64f64f64
2013-01-010.4691nullnullnull
2013-01-021.2121null0.1192null
2013-01-03nullnullnull1.0718
2013-01-040.7216nullnull0.2719
2013-01-05null0.5670.2762null
2013-01-06null0.1136null0.525
df2 = df.with_columns(
    E=pl.Series(['one', 'one', 'two', 'three', 'four', 'three'])
)
df2
shape: (6, 6)
indexABCDE
datef64f64f64f64str
2013-01-010.4691-0.2829-1.5091-1.1356"one"
2013-01-021.2121-0.17320.1192-1.0442"one"
2013-01-03-0.8618-2.1046-0.49491.0718"two"
2013-01-040.7216-0.7068-1.03960.2719"three"
2013-01-05-0.4250.5670.2762-1.0874"four"
2013-01-06-0.67370.1136-1.47840.525"three"
df2.filter(pl.col('E').is_in(['two', 'four']))
shape: (2, 6)
indexABCDE
datef64f64f64f64str
2013-01-03-0.8618-2.1046-0.49491.0718"two"
2013-01-05-0.4250.5670.2762-1.0874"four"

Setting#

# s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
s1 = pl.select(
    index=pl.date_range(pl.date(2013, 1, 2), pl.date(2013, 1, 7)),
    value=pl.Series([1, 2, 3, 4, 5, 6])
)
s1
shape: (6, 2)
indexvalue
datei64
2013-01-021
2013-01-032
2013-01-043
2013-01-054
2013-01-065
2013-01-076
# df["F"] = s1
df = df.join(s1.select('index', F='value'), on='index', how='left')
# df.at[dates[0], "A"] = 0
df = df.with_columns(
    pl.when(pl.col('index') == dates[0])
      .then(0)
      .otherwise('A')
      .alias('A')
)
# df.iat[0, 1] = 0
df = df.with_columns(
    pl.when(pl.int_range(pl.len()) == 0)
      .then(0)
      .otherwise(pl.nth(2))
      .name.keep()
)
# df.loc[:, "D"] = np.array([5] * len(df))
df = df.with_columns(D=5)
df
shape: (6, 6)
indexABCDF
datef64f64f64i32i64
2013-01-010.00.0-1.50915null
2013-01-021.2121-0.17320.119251
2013-01-03-0.8618-2.1046-0.494952
2013-01-040.7216-0.7068-1.039653
2013-01-05-0.4250.5670.276254
2013-01-06-0.67370.1136-1.478455
# df2[df2 > 0] = -df2
df2 = df.select(pl.all())

others = pl.exclude('index')
df2 = df2.select(
    'index',
    pl.when(others > 0)
      .then(-others)
      .otherwise(others)
)
df2
shape: (6, 6)
indexABCDF
datef64f64f64i32i64
2013-01-010.00.0-1.5091-5null
2013-01-02-1.2121-0.1732-0.1192-5-1
2013-01-03-0.8618-2.1046-0.4949-5-2
2013-01-04-0.7216-0.7068-1.0396-5-3
2013-01-05-0.425-0.567-0.2762-5-4
2013-01-06-0.6737-0.1136-1.4784-5-5

Missing data#

df1 = df.select(
    pl.all().slice(0, 4),
    E=pl.Series([1.0, 1.0, None, None])
)
df1
shape: (4, 7)
indexABCDFE
datef64f64f64i32i64f64
2013-01-010.00.0-1.50915null1.0
2013-01-021.2121-0.17320.1192511.0
2013-01-03-0.8618-2.1046-0.494952null
2013-01-040.7216-0.7068-1.039653null
df1.drop_nulls()
shape: (1, 7)
indexABCDFE
datef64f64f64i32i64f64
2013-01-021.2121-0.17320.1192511.0
df1.fill_null(5)
shape: (4, 7)
indexABCDFE
datef64f64f64i32i64f64
2013-01-010.00.0-1.5091551.0
2013-01-021.2121-0.17320.1192511.0
2013-01-03-0.8618-2.1046-0.4949525.0
2013-01-040.7216-0.7068-1.0396535.0
df1.select(
    'index',
    pl.exclude('index').is_null()
)
shape: (4, 7)
indexABCDFE
dateboolboolboolboolboolbool
2013-01-01falsefalsefalsefalsetruefalse
2013-01-02falsefalsefalsefalsefalsefalse
2013-01-03falsefalsefalsefalsefalsetrue
2013-01-04falsefalsefalsefalsefalsetrue

Operations#

Stats#

df.select(pl.exclude('index').mean())
shape: (1, 5)
ABCDF
f64f64f64f64f64
-0.004467-0.384-0.6877675.03.0
df.select(
    'index',
    pl.mean_horizontal(pl.exclude('index'))
)
shape: (6, 2)
indexA
datef64
2013-01-010.872725
2013-01-021.43162
2013-01-030.70774
2013-01-041.39504
2013-01-051.88364
2013-01-061.5923
s = pl.select(
    index=df.get_column('index'),
    value=pl.Series([1, 3, 5, None, 6, 8]).shift(2)
)
s
shape: (6, 2)
indexvalue
datei64
2013-01-01null
2013-01-02null
2013-01-031
2013-01-043
2013-01-055
2013-01-06null
df_l, df_r = pl.align_frames(df, s, on='index')
df_l.select(
    'index',
    pl.exclude('index') - df_r.get_column('value')
)
shape: (6, 6)
indexABCDF
datef64f64f64i64i64
2013-01-01nullnullnullnullnull
2013-01-02nullnullnullnullnull
2013-01-03-1.8618-3.1046-1.494941
2013-01-04-2.2784-3.7068-4.039620
2013-01-05-5.425-4.433-4.72380-1
2013-01-06nullnullnullnullnull

User defined functions#

# df.agg(lambda x: np.mean(x) * 5.6)
df.select(
    pl.exclude('index').mean() * 5.6
)
shape: (1, 5)
ABCDF
f64f64f64f64f64
-0.025013-2.1504-3.85149328.016.8
# df.transform(lambda x: x * 101.2)
df.select(
    pl.exclude('index') * 101.2
)
shape: (6, 5)
ABCDF
f64f64f64f64f64
0.00.0-152.72092506.0null
122.66452-17.5278412.06304506.0101.2
-87.21416-212.98552-50.08388506.0202.4
73.02592-71.52816-105.20752506.0303.6
-43.0157.380427.95144506.0404.8
-68.1784411.49632-149.61408506.0506.0

Value Counts#

s = pl.Series(np.random.randint(0, 7, size=10))
s.value_counts()
shape: (5, 2)
count
i32u32
62
41
01
53
23

String Methods#

s = pl.Series(["A", "B", "C", "Aaba", "Baca", None, "CABA", "dog", "cat"])
s.str.to_lowercase()
shape: (9,)
str
"a"
"b"
"c"
"aaba"
"baca"
null
"caba"
"dog"
"cat"

Merge#

Concat#

data = """
index     0         1         2         3
0 -0.548702  1.467327 -1.015962 -0.483075
1  1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952  0.991460 -0.919069  0.266046
3 -0.709661  1.669052  1.037882 -1.705775
4 -0.919854 -0.042379  1.247642 -0.009920
5  0.290213  0.495767  0.362949  1.548106
6 -1.131345 -0.089329  0.337863 -0.945867
7 -0.932132  1.956030  0.017587 -0.016692
8 -0.575247  0.254161 -1.143704  0.215897
9  1.193555 -0.077118 -0.408530 -0.862495
"""
df = to_dataframe(data)
df
shape: (10, 5)
index0123
i64f64f64f64f64
0-0.5487021.467327-1.015962-0.483075
11.63755-1.217659-0.291519-1.745505
2-0.2639520.99146-0.9190690.266046
3-0.7096611.6690521.037882-1.705775
4-0.919854-0.0423791.247642-0.00992
50.2902130.4957670.3629491.548106
6-1.131345-0.0893290.337863-0.945867
7-0.9321321.956030.017587-0.016692
8-0.5752470.254161-1.1437040.215897
91.193555-0.077118-0.40853-0.862495
pl.concat([df.slice(0, 3), df.slice(3, 4), df.slice(7)])
shape: (10, 5)
index0123
i64f64f64f64f64
0-0.5487021.467327-1.015962-0.483075
11.63755-1.217659-0.291519-1.745505
2-0.2639520.99146-0.9190690.266046
3-0.7096611.6690521.037882-1.705775
4-0.919854-0.0423791.247642-0.00992
50.2902130.4957670.3629491.548106
6-1.131345-0.0893290.337863-0.945867
7-0.9321321.956030.017587-0.016692
8-0.5752470.254161-1.1437040.215897
91.193555-0.077118-0.40853-0.862495

Join#

left = pl.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pl.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
row(left, right)
shape: (2, 2)
keylval
stri64
"foo"1
"foo"2
shape: (2, 2)
keyrval
stri64
"foo"4
"foo"5
left.join(right, on='key')
shape: (4, 3)
keylvalrval
stri64i64
"foo"14
"foo"24
"foo"15
"foo"25
left = pl.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pl.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
row(left, right)
shape: (2, 2)
keylval
stri64
"foo"1
"bar"2
shape: (2, 2)
keyrval
stri64
"foo"4
"bar"5
left.join(right, on='key')
shape: (2, 3)
keylvalrval
stri64i64
"foo"14
"bar"25

Grouping#

data = """
A      B         C         D
foo    one  1.346061 -1.577585
bar    one  1.511763  0.396823
foo    two  1.627081 -0.105381
bar  three -0.990582 -0.532532
foo    two -0.441652  1.453749
bar    two  1.211526  1.208843
foo    one  0.268520 -0.080952
foo  three  0.024580 -0.264610
"""
df = to_dataframe(data)
# df.groupby("A")[["C", "D"]].sum()
df.group_by('A').agg(
    pl.col('C', 'D').sum()
)
shape: (2, 3)
ACD
strf64f64
"foo"2.82459-0.574779
"bar"1.7327071.073134
# df.groupby(["A", "B"]).sum()
(
df
.group_by('A', 'B')
.agg(pl.all().sum())
.sort('A', 'B')
)
shape: (6, 4)
ABCD
strstrf64f64
"bar""one"1.5117630.396823
"bar""three"-0.990582-0.532532
"bar""two"1.2115261.208843
"foo""one"1.614581-1.658537
"foo""three"0.02458-0.26461
"foo""two"1.1854291.348368

Reshaping#

Stack#

data = '''
first second         A         B
bar   one    -0.727965 -0.589346
bar   two     0.339969 -0.693205
baz   one    -0.339355  0.593616
baz   two     0.884345  1.591431
'''
df2 = to_dataframe(data)
df2
shape: (4, 4)
firstsecondAB
strstrf64f64
"bar""one"-0.727965-0.589346
"bar""two"0.339969-0.693205
"baz""one"-0.3393550.593616
"baz""two"0.8843451.591431
stacked = df2.unpivot(['A', 'B'], index=['first', 'second'])
stacked
shape: (8, 4)
firstsecondvariablevalue
strstrstrf64
"bar""one""A"-0.727965
"bar""two""A"0.339969
"baz""one""A"-0.339355
"baz""two""A"0.884345
"bar""one""B"-0.589346
"bar""two""B"-0.693205
"baz""one""B"0.593616
"baz""two""B"1.591431
stacked.pivot('variable', index=['first', 'second'])
shape: (4, 4)
firstsecondAB
strstrf64f64
"bar""one"-0.727965-0.589346
"bar""two"0.339969-0.693205
"baz""one"-0.3393550.593616
"baz""two"0.8843451.591431
stacked.pivot('second', index=['first', 'variable'])
shape: (4, 4)
firstvariableonetwo
strstrf64f64
"bar""A"-0.7279650.339969
"baz""A"-0.3393550.884345
"bar""B"-0.589346-0.693205
"baz""B"0.5936161.591431
stacked.pivot('first', index=['second', 'variable'])
shape: (4, 4)
secondvariablebarbaz
strstrf64f64
"one""A"-0.727965-0.339355
"two""A"0.3399690.884345
"one""B"-0.5893460.593616
"two""B"-0.6932051.591431

Pivot tables#

data = '''
index A  B    C         D         E
0     one  A  foo -1.202872  0.047609
1     one  B  foo -1.814470 -0.136473
2     two  C  foo  1.018601 -0.561757
3   three  A  bar -0.595447 -1.623033
4     one  B  bar  1.395433  0.029399
5     one  C  bar -0.392670 -0.542108
6     two  A  foo  0.007207  0.282696
7   three  B  foo  1.928123 -0.087302
8     one  C  foo -0.055224 -1.575170
9     one  A  bar  2.395985  1.771208
10    two  B  bar  1.552825  0.816482
11  three  C  bar  0.166599  1.100230
'''
df = to_dataframe(data)
df.pivot('C', index=['A', 'B'], values="D").sort('A', 'B')
shape: (9, 4)
ABfoobar
strstrf64f64
"one""A"-1.2028722.395985
"one""B"-1.814471.395433
"one""C"-0.055224-0.39267
"three""A"null-0.595447
"three""B"1.928123null
"three""C"null0.166599
"two""A"0.007207null
"two""B"null1.552825
"two""C"1.018601null

Time series#

n = 1000
rng = pl.datetime(2012, 1, 1) + pl.duration(seconds=pl.int_range(0, n))
ts = pl.select(
    index=rng,
    value=np.random.randint(0, 500, n)
)
(
ts
.group_by(pl.col('index').dt.truncate('5m'), maintain_order=True)
.agg(pl.col('value').sum())
)
shape: (4, 2)
indexvalue
datetime[μs]i32
2012-01-01 00:00:0072130
2012-01-01 00:05:0076640
2012-01-01 00:10:0071218
2012-01-01 00:15:0025874
rng = pl.datetime(2012, 3, 6) + pl.duration(days=pl.int_range(0, 5))
ts = pl.select(
    index=rng,
    value=np.random.randn(5)
)
ts
shape: (5, 2)
indexvalue
datetime[μs]f64
2012-03-06 00:00:00-0.037589
2012-03-07 00:00:00-0.422765
2012-03-08 00:00:000.148907
2012-03-09 00:00:00-0.02686
2012-03-10 00:00:000.813454
ts_utc = ts.with_columns(
    pl.col('index').dt.replace_time_zone("UTC")
)
ts_utc.with_columns(
    pl.col('index').dt.convert_time_zone('US/Eastern')
)
shape: (5, 2)
indexvalue
datetime[μs, US/Eastern]f64
2012-03-05 19:00:00 EST-0.037589
2012-03-06 19:00:00 EST-0.422765
2012-03-07 19:00:00 EST0.148907
2012-03-08 19:00:00 EST-0.02686
2012-03-09 19:00:00 EST0.813454
ts.with_columns(
    pl.col('index').dt.add_business_days(5, roll="backward")
)
shape: (5, 2)
indexvalue
datetime[μs]f64
2012-03-13 00:00:00-0.037589
2012-03-14 00:00:00-0.422765
2012-03-15 00:00:000.148907
2012-03-16 00:00:00-0.02686
2012-03-16 00:00:000.813454

Categoricals#

df = pl.DataFrame({
    "id": [1, 2, 3, 4, 5, 6], 
    "raw_grade": ["a", "b", "b", "a", "a", "e"]
})
df = df.with_columns(
    grade=pl.col('raw_grade').cast(pl.Categorical)
)
df
shape: (6, 3)
idraw_gradegrade
i64strcat
1"a""a"
2"b""b"
3"b""b"
4"a""a"
5"a""a"
6"e""e"
df = df.with_columns(
    pl.col('grade')
      .cast(pl.String)
      .replace(dict(a='very good', b='good', e="very bad"))
      .cast(pl.Categorical)
)
grades = pl.Enum(["very bad", "bad", "medium", "good", "very good"])
df = df.with_columns(
    pl.col('grade').cast(grades)
)
df.sort('grade')
shape: (6, 3)
idraw_gradegrade
i64strenum
6"e""very bad"
2"b""good"
3"b""good"
1"a""very good"
4"a""very good"
5"a""very good"
# df.groupby("grade", observed=False).size()
(
df
.group_by('grade')
.agg(pl.len())
.join(pl.select(grade=grades.categories.cast(grades)), on='grade', how='right')
.with_columns(pl.col('len').fill_null(0))
)
shape: (5, 2)
lengrade
u32enum
1"very bad"
0"bad"
0"medium"
2"good"
3"very good"

Plotting#

import hvplot.polars
ts = pl.select(
    index=pl.datetime(2000, 1, 1) + pl.duration(days=pl.int_range(1000)),
    value=np.random.randn(1000)
)
ts = ts.with_columns(pl.col('value').cum_sum())
ts.hvplot.line(x='index')
df = pl.DataFrame(
    np.random.randn(1000, 4), schema=['A', 'B', 'C', 'D']
)
df = df.with_columns(
    ts['index'],
    pl.all().cum_sum()
)
df.hvplot.line(x='index')

Importing and exporting data#

CSV#

df = pl.DataFrame(np.random.randint(0, 5, (10, 5)))
df.write_csv('data/pandas_foo.csv')
pl.read_csv('data/pandas_foo.csv')
shape: (10, 5)
column_0column_1column_2column_3column_4
i64i64i64i64i64
12001
31041
31012
21100
03044
44430
20224
02133
40324
21411

Parquet#

df.write_parquet('data/pandas_foo.parquet')
pl.read_parquet('data/pandas_foo.parquet')
shape: (10, 5)
column_0column_1column_2column_3column_4
i32i32i32i32i32
12001
31041
31012
21100
03044
44430
20224
02133
40324
21411

Excel#

df.write_excel('data/pandas_foo.xlsx', 'Sheet1')
<xlsxwriter.workbook.Workbook at 0x17dbc86c950>
pl.read_excel('data/pandas_foo.xlsx', sheet_name='Sheet1')
shape: (10, 5)
column_0column_1column_2column_3column_4
i64i64i64i64i64
12001
31041
31012
21100
03044
44430
20224
02133
40324
21411

Gotchas#

if pl.Series([False, True, False]):
     print("I was true")
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[199], line 1
----> 1 if pl.Series([False, True, False]):
      2      print("I was true")

File C:\mamba\envs\py312\Lib\site-packages\polars\series\series.py:636, in Series.__bool__(self)
    626 def __bool__(self) -> NoReturn:
    627     msg = (
    628         "the truth value of a Series is ambiguous"
    629         "\n\n"
   (...)
    634         "- instead of `s in [y, z]`, use `s.is_in([y, z])`\n"
    635     )
--> 636     raise TypeError(msg)

TypeError: the truth value of a Series is ambiguous

Here are some things you might want to try:
- instead of `if s`, use `if not s.is_empty()`
- instead of `s1 and s2`, use `s1 & s2`
- instead of `s1 or s2`, use `s1 | s2`
- instead of `s in [y, z]`, use `s.is_in([y, z])`