Intro to data structures

Intro to data structures#

import numpy as np
import polars as pl
from helper.jupyter import row

Series#

In Polars, a Series has no index, so this guide uses a two-column DataFrame (an `index` column plus a `value` column) to emulate the behavior of a pandas Series.

s = pl.DataFrame(dict(
    index=["a", "b", "c", "d", "e"],
    value=np.random.randn(5)
))
s

shape: (5, 2)

index	value
str	f64
"a"	-1.163544
"b"	-0.900663
"c"	1.657318
"d"	-1.240945
"e"	0.964103

s['index']

shape: (5,)

index
str
"a"
"b"
"c"
"d"
"e"

pl.Series(np.random.randn(5))

shape: (5,)


f64
0.083327
-1.077013
0.120624
-0.480749
-1.196976

# pd.Series(d)
d = {"b": 1, "a": 0, "c": 2}
pl.DataFrame(list(d.items()), schema=['index', 'value'], orient='row')

shape: (3, 2)

index	value
str	i64
"b"	1
"a"	0
"c"	2

pl.select(
    index=pl.Series(["a", "b", "c", "d", "e"]),
    value=5.0
)

shape: (5, 2)

index	value
str	f64
"a"	5.0
"b"	5.0
"c"	5.0
"d"	5.0
"e"	5.0

Series is ndarray-like#

s['value'][0]

-1.163544306576043

s['value'][:3]

shape: (3,)

value
f64
-1.163544
-0.900663
1.657318

# s[s > s.median()]
s.filter(pl.col('value') > pl.col('value').median())

shape: (2, 2)

index	value
str	f64
"c"	1.657318
"e"	0.964103

# s.iloc[[4, 3, 1]]
s[[4, 3, 1]]

shape: (3, 2)

index	value
str	f64
"e"	0.964103
"d"	-1.240945
"b"	-0.900663

s.with_columns(
    pl.col('value').exp()
)

shape: (5, 2)

index	value
str	f64
"a"	0.312377
"b"	0.4063
"c"	5.245225
"d"	0.289111
"e"	2.622434

s['value'].dtype

Float64

# s.array
s['value'].to_numpy()

array([-1.16354431, -0.90066273,  1.65731812, -1.24094468,  0.96410308])

Series is dict-like#

# s["a"]
s.select(pl.col('value').filter(pl.col('index') == 'a')).item()

-1.163544306576043

# s["e"] = 12.0
s = s.with_columns(
    pl.when(pl.col('index') == 'e')
      .then(12.0)
      .otherwise(pl.col('value'))
      .name.keep()
)

# "e" in s
"e" in s['index']

True

"f" in s['index']

False

Vectorized operations and label alignment with Series#

# s + s
from helper.polars import align_op
align_op(s, s, op=pl.Expr.add)

shape: (5, 2)

index	value
str	f64
"a"	-2.327089
"b"	-1.801325
"c"	3.314636
"d"	-2.481889
"e"	24.0

# s * 2
s.select(
    'index',
    pl.col('value') * 2
)

shape: (5, 2)

index	value
str	f64
"a"	-2.327089
"b"	-1.801325
"c"	3.314636
"d"	-2.481889
"e"	24.0

# np.exp(s)
s.select(
    "index",
    pl.col("value").exp()
)

shape: (5, 2)

index	value
str	f64
"a"	0.312377
"b"	0.4063
"c"	5.245225
"d"	0.289111
"e"	162754.791419

# s.iloc[1:] + s.iloc[:-1]
align_op(
    s.slice(1), 
    s.slice(0, len(s) - 1), 
    op=pl.Expr.add, 
    fill_value=None, 
    how="full")

shape: (5, 2)

index	value
str	f64
"a"	null
"b"	-1.801325
"c"	3.314636
"d"	-2.481889
"e"	null

Name attribute#

s = pl.Series("something", np.random.randn(5))
s

shape: (5,)

something
f64
1.879844
1.553987
-1.190783
-1.452195
-0.553582

s.name

'something'

s2 = s.rename('different')
s2.name

'different'

DataFrame#

s1 = pl.DataFrame(dict(index=["a", "b", "c"], one=[1.0, 2.0, 3.0]))
s2 = pl.DataFrame(dict(index=["a", "b", "c", "d"], two=[1.0, 2.0, 3.0, 4.0]))
df = s1.join(s2, on='index', how='full', coalesce=True)
df

shape: (4, 3)

index	one	two
str	f64	f64
"a"	1.0	1.0
"b"	2.0	2.0
"c"	3.0	3.0
"d"	null	4.0

df['index']

shape: (4,)

index
str
"a"
"b"
"c"
"d"

df.columns

['index', 'one', 'two']

df.drop('index').columns

['one', 'two']

From dict of ndarrays / lists#

d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
pl.DataFrame(d)

shape: (4, 2)

one	two
f64	f64
1.0	4.0
2.0	3.0
3.0	2.0
4.0	1.0

pl.DataFrame(d).insert_column(0, pl.Series('index', ["a", "b", "c", "d"]))

shape: (4, 3)

index	one	two
str	f64	f64
"a"	1.0	4.0
"b"	2.0	3.0
"c"	3.0	2.0
"d"	4.0	1.0

From structured or record array#

# Two-record structured array with an int32, a float32 and a 10-byte
# bytes field. "S10" is the supported spelling of the bytes dtype; the
# legacy "a10" alias is deprecated as of NumPy 2.0.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
pl.DataFrame(data)

shape: (2, 3)

A	B	C
i32	f32	binary
1	2.0	b"Hello"
2	3.0	b"World"

pl.DataFrame(data).insert_column(0, pl.Series('index', ['first', 'second']))

shape: (2, 4)

index	A	B	C
str	i32	f32	binary
"first"	1	2.0	b"Hello"
"second"	2	3.0	b"World"

pl.DataFrame(data).select("C", "A", "B")

shape: (2, 3)

C	A	B
binary	i32	f32
b"Hello"	1	2.0
b"World"	2	3.0

From a list of dicts#

data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}]
pl.DataFrame(data2)

shape: (2, 3)

a	b	c
i64	i64	i64
1	2	null
5	10	20

pl.DataFrame(data2).insert_column(0, pl.Series('index', ['first', 'second']))

shape: (2, 4)

index	a	b	c
str	i64	i64	i64
"first"	1	2	null
"second"	5	10	20

pl.DataFrame(data2).select('a', 'b')

shape: (2, 2)

a	b
i64	i64
1	2
5	10

From a dict of tuples#

data = {
    ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
    ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
    ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
    ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
    ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
}

# Build one small frame per outer key: every inner (index0, index1) tuple
# becomes a row, and the outer key tuple is joined with "-" to name the
# value column.
dfs = [
    pl.DataFrame(
        [[*inner_key, cell] for inner_key, cell in inner.items()],
        orient='row',
        schema=['index0', 'index1', '-'.join(outer_key)],
    )
    for outer_key, inner in data.items()
]

# Combine the per-key frames using the "align" concat strategy.
pl.concat(dfs, how="align")

shape: (3, 7)

index0	index1	a-b	a-a	a-c	b-a	b-b
str	str	i64	i64	i64	i64	i64
"A"	"B"	1	4	5	8	10
"A"	"C"	2	3	6	7	null
"A"	"D"	null	null	null	null	9

From a Series#

ser = pl.Series('ser', range(3))
pl.DataFrame(ser).insert_column(0, pl.Series('index', list("abc")))

shape: (3, 2)

index	ser
str	i64
"a"	0
"b"	1
"c"	2

From a list of namedtuples#

from collections import namedtuple
Point = namedtuple("Point", "x y")
pl.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])

shape: (3, 2)

x	y
i64	i64
0	0
0	3
2	3

Point3D = namedtuple("Point3D", "x y z")
data = [Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]
pl.DataFrame([p._asdict() for p in data])

shape: (3, 3)

x	y	z
i64	i64	i64
0	0	0
0	3	5
2	3	null

From a list of dataclasses#

from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
pl.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])

shape: (3, 2)

x	y
i64	i64
0	0
0	3
2	3

Alternate constructors#

DataFrame.from_dict#

pl.DataFrame(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]))

shape: (3, 2)

A	B
i64	i64
1	4
2	5
3	6

data = dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])
pl.DataFrame(
    list(data.values()), schema=['one', 'two', 'three'], orient='row'
).insert_column(0, pl.Series('index', data.keys()))

shape: (2, 4)

index	one	two	three
str	i64	i64	i64
"A"	1	2	3
"B"	4	5	6

DataFrame.from_records#

# Two-record structured array with an int32, a float32 and a 10-byte
# bytes field. "S10" is the supported spelling of the bytes dtype; the
# legacy "a10" alias is deprecated as of NumPy 2.0.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
pl.DataFrame(data)

shape: (2, 3)

A	B	C
i32	f32	binary
1	2.0	b"Hello"
2	3.0	b"World"

Column selection, addition, deletion#

df['one']

shape: (4,)

one
f64
1.0
2.0
3.0
null

df = df.with_columns(
    three=pl.col('one') * pl.col('two'),
    flag=pl.col('one') > 2
)

# del df["two"]
df = df.drop('two')

#three = df.pop("three")
three = df['three']
df = df.drop('three')
df

shape: (4, 3)

index	one	flag
str	f64	bool
"a"	1.0	false
"b"	2.0	false
"c"	3.0	true
"d"	null	null

df = df.with_columns(foo=pl.lit('bar'))
df

shape: (4, 4)

index	one	flag	foo
str	f64	bool	str
"a"	1.0	false	"bar"
"b"	2.0	false	"bar"
"c"	3.0	true	"bar"
"d"	null	null	"bar"

# df["one_trunc"] = df["one"][:2]
df = df.with_columns(
    one_trunc=pl.col('one').slice(0, 2).append(pl.repeat(None, pl.len() - 2))
)
df

shape: (4, 5)

index	one	flag	foo	one_trunc
str	f64	bool	str	f64
"a"	1.0	false	"bar"	1.0
"b"	2.0	false	"bar"	2.0
"c"	3.0	true	"bar"	null
"d"	null	null	"bar"	null

df.insert_column(1, df["one"].rename('bar'))
df

shape: (4, 6)

index	bar	one	flag	foo	one_trunc
str	f64	f64	bool	str	f64
"a"	1.0	1.0	false	"bar"	1.0
"b"	2.0	2.0	false	"bar"	2.0
"c"	3.0	3.0	true	"bar"	null
"d"	null	null	null	"bar"	null

Assigning new columns in method chains#

iris = pl.read_csv('data/iris.data')
iris.head()

shape: (5, 5)

SepalLength	SepalWidth	PetalLength	PetalWidth	Name
f64	f64	f64	f64	str
5.1	3.5	1.4	0.2	"Iris-setosa"
4.9	3.0	1.4	0.2	"Iris-setosa"
4.7	3.2	1.3	0.2	"Iris-setosa"
4.6	3.1	1.5	0.2	"Iris-setosa"
5.0	3.6	1.4	0.2	"Iris-setosa"

iris.with_columns(
    sepal_ratio=pl.col('SepalWidth') / pl.col('SepalLength')
).head()

shape: (5, 6)

SepalLength	SepalWidth	PetalLength	PetalWidth	Name	sepal_ratio
f64	f64	f64	f64	str	f64
5.1	3.5	1.4	0.2	"Iris-setosa"	0.686275
4.9	3.0	1.4	0.2	"Iris-setosa"	0.612245
4.7	3.2	1.3	0.2	"Iris-setosa"	0.680851
4.6	3.1	1.5	0.2	"Iris-setosa"	0.673913
5.0	3.6	1.4	0.2	"Iris-setosa"	0.72

import hvplot.polars

(
iris
.filter(pl.col('SepalLength') > 5)
.with_columns(
    SepalRatio=pl.col('SepalWidth') / pl.col('SepalLength'),
    PetalRatio=pl.col('PetalWidth') / pl.col('PetalLength')
)
.hvplot.scatter(x='SepalRatio', y='PetalRatio')
)

dfa = pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
(
dfa
.with_columns(C = pl.col('A') + pl.col('B'))
.with_columns(D = pl.col('A') + pl.col('C'))
)

shape: (3, 4)

A	B	C	D
i64	i64	i64	i64
1	4	5	6
2	5	7	9
3	6	9	12

Indexing / selection#

# df.loc["b"]
df.filter(pl.col('index') == 'b')

shape: (1, 6)

index	bar	one	flag	foo	one_trunc
str	f64	f64	bool	str	f64
"b"	2.0	2.0	false	"bar"	2.0

# df.iloc[2]
df.slice(2, 1)

shape: (1, 6)

index	bar	one	flag	foo	one_trunc
str	f64	f64	bool	str	f64
"c"	3.0	3.0	true	"bar"	null

Data alignment and arithmetic#

# df + df2
df = pl.DataFrame(np.random.randn(10, 4), schema=["A", "B", "C", "D"])
df2 = pl.DataFrame(np.random.randn(7, 3), schema=["A", "B", "C"])
align_op(df.with_row_index(), df2.with_row_index(), pl.Expr.add, fill_value=None)

shape: (10, 5)

index	A	B	C	D
u32	f64	f64	f64	f64
0	0.779298	0.901811	0.091126	1.41107
1	-0.936034	-2.621484	0.237825	1.146001
2	-1.192351	-3.360748	1.563503	0.1362
3	0.933616	-0.613897	-1.318911	0.193137
4	-0.195308	-0.5624	0.118895	0.222491
5	0.876072	-0.813732	1.367814	0.554935
6	0.73751	0.994505	1.104228	0.4458
7	null	null	null	0.959163
8	null	null	null	1.144944
9	null	null	null	0.158198

# df - df.iloc[0]
df.select(pl.all() - pl.all().first())

shape: (10, 4)

A	B	C	D
f64	f64	f64	f64
0.0	0.0	0.0	0.0
-0.755136	-2.481002	0.449509	-0.265068
-0.660488	-3.696891	1.680688	-1.27487
1.186271	-2.438415	-0.240415	-1.217932
-0.508953	-1.89029	1.344603	-1.188579
-0.631357	-2.716706	0.657985	-0.856135
0.117999	-1.390857	0.727739	-0.96527
0.549259	-1.176227	2.202544	-0.451907
0.772859	-1.250499	2.459803	-0.266126
-0.366154	-0.029816	1.599533	-1.252872

df * 5 + 2
df.select(pl.all() * 5 + 2) # or this

shape: (10, 4)

A	B	C	D
f64	f64	f64	f64
0.646731	10.257392	-3.479233	9.055349
-3.128947	-2.147616	-1.231688	7.730007
-2.65571	-8.227061	4.924206	2.681001
6.578087	-1.934685	-4.681306	2.965687
-1.898036	0.80594	3.243782	3.112453
-2.510055	-3.32614	-0.189309	4.774675
1.236725	3.303107	0.159463	4.228999
3.393025	4.376257	7.533488	6.795814
4.511027	4.004899	8.819779	7.724719
-1.184039	10.108311	4.518431	2.790988

df.select((1 / pl.all()).name.keep())

shape: (10, 4)

A	B	C	D
f64	f64	f64	f64
-3.694757	0.605518	-0.912536	0.708682
-0.974859	-1.205512	-1.547179	0.872599
-1.07395	-0.488899	1.709866	7.342137
1.092159	-1.27075	-0.748357	5.177661
-1.282697	-4.187395	4.019998	4.494571
-1.108634	-0.938766	-2.283826	1.802013
-6.550721	3.836984	-2.716598	2.243159
3.58931	2.104149	0.903589	1.042576
1.991217	2.493891	0.733162	0.873405
-1.570332	0.616651	1.985363	6.321207

df.select(pl.all() ** 4)

shape: (10, 4)

A	B	C	D
f64	f64	f64	f64
0.005366	7.438608	1.442113	3.964551
1.107218	0.473494	0.174517	1.724807
0.751733	17.503438	0.116991	0.000344
0.702839	0.383495	3.188347	0.001391
0.369406	0.003253	0.003829	0.00245
0.661984	1.287568	0.036758	0.094835
0.000543	0.004614	0.018361	0.039497
0.006025	0.051015	1.500085	0.846387
0.06361	0.025852	3.460997	1.718449
0.16445	6.915785	0.064364	0.000626

df1 = pl.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}).cast(pl.Boolean)
df2 = pl.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}).cast(pl.Boolean)

# df1 & df2
align_op(df1.with_row_index(), df2.with_row_index(), pl.Expr.and_)

shape: (3, 3)

index	a	b
u32	bool	bool
0	false	false
1	false	true
2	true	false

# df1 | df2
align_op(df1.with_row_index(), df2.with_row_index(), pl.Expr.or_)

shape: (3, 3)

index	a	b
u32	bool	bool
0	true	true
1	true	true
2	true	true

# df1 ^ df2
align_op(df1.with_row_index(), df2.with_row_index(), pl.Expr.xor)

shape: (3, 3)

index	a	b
u32	bool	bool
0	true	true
1	true	false
2	false	true

# -df1
df1.select(pl.all().not_())

shape: (3, 2)

a	b
bool	bool
false	true
true	false
false	false

Transposing#

df.slice(0, 5).transpose(include_header=True, header_name='index')

shape: (4, 6)

index	column_0	column_1	column_2	column_3	column_4
str	f64	f64	f64	f64	f64
"A"	-0.270654	-1.025789	-0.931142	0.915617	-0.779607
"B"	1.651478	-0.829523	-2.045412	-0.786937	-0.238812
"C"	-1.095847	-0.646338	0.584841	-1.336261	0.248756
"D"	1.41107	1.146001	0.1362	0.193137	0.222491

DataFrame interoperability with NumPy functions#

np.exp(df)

array([[0.76288056, 5.2146839 , 0.33425647, 4.10033964],
       [0.35851332, 0.43625722, 0.52396123, 3.14558954],
       [0.39410336, 0.12932687, 1.79470589, 1.14591119],
       [2.49831716, 0.45523708, 0.26282647, 1.21304945],
       [0.4585861 , 0.78756298, 1.28242953, 1.24918415],
       [0.4057529 , 0.34464929, 0.64541498, 1.74182763],
       [0.85842587, 1.29773619, 0.69204278, 1.56173886],
       [1.32128542, 1.60841852, 3.02435428, 2.60951072],
       [1.65236152, 1.49328708, 3.91163667, 3.14226478],
       [0.52897833, 5.06149624, 1.65480988, 1.17139769]])

np.asarray(df)

array([[-0.27065379,  1.65147847, -1.0958467 ,  1.41106981],
       [-1.02578947, -0.82952326, -0.64633759,  1.14600133],
       [-0.93114207, -2.04541218,  0.58484116,  0.13620012],
       [ 0.91561737, -0.78693694, -1.33626128,  0.19313739],
       [-0.77960722, -0.23881194,  0.24875635,  0.22249066],
       [-0.90201093, -1.06522792, -0.4378618 ,  0.55493493],
       [-0.15265495,  0.26062136, -0.3681075 ,  0.44579985],
       [ 0.27860506,  0.47525141,  1.10669761,  0.95916274],
       [ 0.50220549,  0.40097979,  1.36395587,  1.14494381],
       [-0.63680781,  1.62166214,  0.50368613,  0.15819764]])

ser = pl.Series([1, 2, 3, 4])
np.exp(ser)

shape: (4,)


f64
2.718282
7.389056
20.085537
54.59815

ser1 = pl.DataFrame(dict(value=[1, 2, 3], index=["a", "b", "c"]))
ser2 = pl.DataFrame(dict(value=[1, 3, 5], index=["b", "a", "c"]))
row(ser1, ser2)

shape: (3, 2)

value	index
i64	str
1	"a"
2	"b"
3	"c"

shape: (3, 2)

value	index
i64	str
1	"b"
3	"a"
5	"c"

# np.remainder(ser1, ser2)
ser1_a, ser2_a = pl.align_frames(ser1, ser2, on='index')
row(ser1_a, ser2_a, pl.select(
    index=ser1_a['index'],
    value=np.remainder(ser1_a['value'], ser2_a['value']))   
)

shape: (3, 2)

value	index
i64	str
1	"a"
2	"b"
3	"c"

shape: (3, 2)

value	index
i64	str
3	"a"
1	"b"
5	"c"

shape: (3, 2)

index	value
str	i64
"a"	1
"b"	0
"c"	3

Console display#

baseball = pl.read_csv('data/baseball.csv')
print(baseball)

shape: (100, 23)
┌───────┬───────────┬──────┬───────┬───┬─────┬─────┬─────┬──────┐
│ id    ┆ player    ┆ year ┆ stint ┆ … ┆ hbp ┆ sh  ┆ sf  ┆ gidp │
│ ---   ┆ ---       ┆ ---  ┆ ---   ┆   ┆ --- ┆ --- ┆ --- ┆ ---  │
│ i64   ┆ str       ┆ i64  ┆ i64   ┆   ┆ f64 ┆ f64 ┆ f64 ┆ f64  │
╞═══════╪═══════════╪══════╪═══════╪═══╪═════╪═════╪═════╪══════╡
│ 88641 ┆ womacto01 ┆ 2006 ┆ 2     ┆ … ┆ 0.0 ┆ 3.0 ┆ 0.0 ┆ 0.0  │
│ 88643 ┆ schilcu01 ┆ 2006 ┆ 1     ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 88645 ┆ myersmi01 ┆ 2006 ┆ 1     ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 88649 ┆ helliri01 ┆ 2006 ┆ 1     ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 88650 ┆ johnsra05 ┆ 2006 ┆ 1     ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ …     ┆ …         ┆ …    ┆ …     ┆ … ┆ …   ┆ …   ┆ …   ┆ …    │
│ 89525 ┆ benitar01 ┆ 2007 ┆ 2     ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 89526 ┆ benitar01 ┆ 2007 ┆ 1     ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 89530 ┆ ausmubr01 ┆ 2007 ┆ 1     ┆ … ┆ 6.0 ┆ 4.0 ┆ 1.0 ┆ 11.0 │
│ 89533 ┆ aloumo01  ┆ 2007 ┆ 1     ┆ … ┆ 2.0 ┆ 0.0 ┆ 3.0 ┆ 13.0 │
│ 89534 ┆ alomasa02 ┆ 2007 ┆ 1     ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0  │
└───────┴───────────┴──────┴───────┴───┴─────┴─────┴─────┴──────┘

baseball.glimpse()

Rows: 100
Columns: 23
$ id     <i64> 88641, 88643, 88645, 88649, 88650, 88652, 88653, 88662, 89177, 89178
$ player <str> 'womacto01', 'schilcu01', 'myersmi01', 'helliri01', 'johnsra05', 'finlest01', 'gonzalu01', 'seleaa01', 'francju01', 'francju01'
$ year   <i64> 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007
$ stint  <i64> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1
$ team   <str> 'CHN', 'BOS', 'NYA', 'MIL', 'NYA', 'SFN', 'ARI', 'LAN', 'ATL', 'NYN'
$ lg     <str> 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'NL', 'NL', 'NL', 'NL'
$ g      <i64> 19, 31, 62, 20, 33, 139, 153, 28, 15, 40
$ ab     <i64> 50, 2, 0, 3, 6, 426, 586, 26, 40, 50
$ r      <i64> 6, 0, 0, 0, 0, 66, 93, 2, 1, 7
$ h      <i64> 14, 1, 0, 0, 1, 105, 159, 5, 10, 10
$ X2b    <i64> 1, 0, 0, 0, 0, 21, 52, 1, 3, 0
$ X3b    <i64> 0, 0, 0, 0, 0, 12, 2, 0, 0, 0
$ hr     <i64> 1, 0, 0, 0, 0, 6, 15, 0, 0, 1
$ rbi    <f64> 2.0, 0.0, 0.0, 0.0, 0.0, 40.0, 73.0, 0.0, 8.0, 8.0
$ sb     <f64> 1.0, 0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 0.0, 0.0, 2.0
$ cs     <f64> 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0
$ bb     <i64> 4, 0, 0, 0, 0, 46, 69, 1, 4, 10
$ so     <f64> 4.0, 1.0, 0.0, 2.0, 4.0, 55.0, 58.0, 7.0, 10.0, 13.0
$ ibb    <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 10.0, 0.0, 1.0, 0.0
$ hbp    <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 7.0, 0.0, 0.0, 0.0
$ sh     <f64> 3.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 6.0, 0.0, 0.0
$ sf     <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 6.0, 0.0, 1.0, 1.0
$ gidp   <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 6.0, 14.0, 1.0, 1.0, 1.0

print(
    baseball.select(pl.nth(range(0, 12)).tail(20))
)

shape: (20, 12)
┌───────┬───────────┬──────┬───────┬───┬─────┬─────┬─────┬─────┐
│ id    ┆ player    ┆ year ┆ stint ┆ … ┆ r   ┆ h   ┆ X2b ┆ X3b │
│ ---   ┆ ---       ┆ ---  ┆ ---   ┆   ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64   ┆ str       ┆ i64  ┆ i64   ┆   ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═══════╪═══════════╪══════╪═══════╪═══╪═════╪═════╪═════╪═════╡
│ 89474 ┆ finlest01 ┆ 2007 ┆ 1     ┆ … ┆ 9   ┆ 17  ┆ 3   ┆ 0   │
│ 89480 ┆ embreal01 ┆ 2007 ┆ 1     ┆ … ┆ 0   ┆ 0   ┆ 0   ┆ 0   │
│ 89481 ┆ edmonji01 ┆ 2007 ┆ 1     ┆ … ┆ 39  ┆ 92  ┆ 15  ┆ 2   │
│ 89482 ┆ easleda01 ┆ 2007 ┆ 1     ┆ … ┆ 24  ┆ 54  ┆ 6   ┆ 0   │
│ 89489 ┆ delgaca01 ┆ 2007 ┆ 1     ┆ … ┆ 71  ┆ 139 ┆ 30  ┆ 0   │
│ …     ┆ …         ┆ …    ┆ …     ┆ … ┆ …   ┆ …   ┆ …   ┆ …   │
│ 89525 ┆ benitar01 ┆ 2007 ┆ 2     ┆ … ┆ 0   ┆ 0   ┆ 0   ┆ 0   │
│ 89526 ┆ benitar01 ┆ 2007 ┆ 1     ┆ … ┆ 0   ┆ 0   ┆ 0   ┆ 0   │
│ 89530 ┆ ausmubr01 ┆ 2007 ┆ 1     ┆ … ┆ 38  ┆ 82  ┆ 16  ┆ 3   │
│ 89533 ┆ aloumo01  ┆ 2007 ┆ 1     ┆ … ┆ 51  ┆ 112 ┆ 19  ┆ 1   │
│ 89534 ┆ alomasa02 ┆ 2007 ┆ 1     ┆ … ┆ 1   ┆ 3   ┆ 1   ┆ 0   │
└───────┴───────────┴──────┴───────┴───┴─────┴─────┴─────┴─────┘

print(pl.DataFrame(np.random.randn(3, 12)))

shape: (3, 12)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ column_0 ┆ column_1  ┆ column_2  ┆ column_3  ┆ … ┆ column_8  ┆ column_9  ┆ column_10 ┆ column_11 │
│ ---      ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ f64      ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.545788 ┆ -0.786944 ┆ 0.511419  ┆ -1.465452 ┆ … ┆ 0.103465  ┆ 1.25846   ┆ -1.961481 ┆ -0.892518 │
│ 0.828004 ┆ 0.291711  ┆ -0.666151 ┆ -0.056065 ┆ … ┆ -1.487244 ┆ -0.325992 ┆ -0.25713  ┆ -0.381324 │
│ 0.44002  ┆ 0.21451   ┆ 0.285626  ┆ -0.689011 ┆ … ┆ -0.06698  ┆ 1.482008  ┆ 1.288873  ┆ -1.12619  │
└──────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘

with pl.Config() as cfg:
    cfg.set_tbl_width_chars(40)
    print(pl.DataFrame(np.random.randn(3, 12)))

shape: (3, 12)
┌─────┬─────┬─────┬─────┬───┬─────┬─────┬─────┬─────┐
│ col ┆ col ┆ col ┆ col ┆ … ┆ col ┆ col ┆ col ┆ col │
│ umn ┆ umn ┆ umn ┆ umn ┆   ┆ umn ┆ umn ┆ umn ┆ umn │
│ _0  ┆ _1  ┆ _2  ┆ _3  ┆   ┆ _8  ┆ _9  ┆ _10 ┆ _11 │
│ --- ┆ --- ┆ --- ┆ --- ┆   ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═════╪═════╪═════╪═══╪═════╪═════╪═════╪═════╡
│ -1. ┆ 0.0 ┆ -0. ┆ 1.7 ┆ … ┆ 0.8 ┆ 0.3 ┆ -1. ┆ -1. │
│ 593 ┆ 321 ┆ 382 ┆ 495 ┆   ┆ 431 ┆ 008 ┆ 529 ┆ 861 │
│ 329 ┆ 95  ┆ 463 ┆ 8   ┆   ┆ 73  ┆ 16  ┆ 093 ┆ 916 │
│ 0.5 ┆ -0. ┆ 0.5 ┆ -0. ┆ … ┆ -0. ┆ -0. ┆ -1. ┆ 0.6 │
│ 488 ┆ 337 ┆ 436 ┆ 509 ┆   ┆ 462 ┆ 292 ┆ 385 ┆ 538 │
│ 96  ┆ 439 ┆ 99  ┆ 815 ┆   ┆ 243 ┆ 441 ┆ 327 ┆ 87  │
│ -1. ┆ -0. ┆ 0.7 ┆ -0. ┆ … ┆ 0.1 ┆ -0. ┆ -0. ┆ 0.5 │
│ 150 ┆ 856 ┆ 171 ┆ 110 ┆   ┆ 902 ┆ 484 ┆ 245 ┆ 015 │
│ 248 ┆ 766 ┆ 96  ┆ 738 ┆   ┆ 8   ┆ 427 ┆ 482 ┆ 93  │
└─────┴─────┴─────┴─────┴───┴─────┴─────┴─────┴─────┘

datafile = {
    "filename": ["filename_01", "filename_02"],
    "path": [
        "media/user_name/storage/folder_01/filename_01",
        "media/user_name/storage/folder_02/filename_02",
    ],
}
with pl.Config() as cfg:
    cfg.set_tbl_width_chars(30)
    print(pl.DataFrame(datafile))

shape: (2, 2)
┌─────────────┬──────────────┐
│ filename    ┆ path         │
│ ---         ┆ ---          │
│ str         ┆ str          │
╞═════════════╪══════════════╡
│ filename_01 ┆ media/user_n │
│             ┆ ame/storage/ │
│             ┆ folder…      │
│ filename_02 ┆ media/user_n │
│             ┆ ame/storage/ │
│             ┆ folder…      │
└─────────────┴──────────────┘

with pl.Config() as cfg:
    cfg.set_tbl_width_chars(100)
    print(pl.DataFrame(datafile))

shape: (2, 2)
┌─────────────┬─────────────────────────────────┐
│ filename    ┆ path                            │
│ ---         ┆ ---                             │
│ str         ┆ str                             │
╞═════════════╪═════════════════════════════════╡
│ filename_01 ┆ media/user_name/storage/folder… │
│ filename_02 ┆ media/user_name/storage/folder… │
└─────────────┴─────────────────────────────────┘

DataFrame column attribute access and IPython completion#

Polars has no equivalent feature: columns are not exposed as DataFrame attributes, so use `df['col']` or `pl.col('col')` instead.

Intro to data structures

Contents

Intro to data structures#

Series#

Series is ndarray-like#

Series is dict-like#

Vectorized operations and label alignment with Series#

Name attribute#

DataFrame#

From dict of ndarrays / lists#

From structured or record array#

From a list of dicts#

From a dict of tuples#

From a Series#

From a list of namedtuples#

From a list of dataclasses#

Alternate constructors#

DataFrame.from_dict#

DataFrame.from_records#

Column selection, addition, deletion#

Assigning new columns in method chains#

Indexing / selection#

Data alignment and arithmetic#

Transposing#

DataFrame interoperability with NumPy functions#

Console display#

DataFrame column attribute access and IPython completion#