Intro to data structures#
import numpy as np
import polars as pl
from helper.jupyter import row
Series#
A Polars Series has no index, so to emulate a pandas Series we use a two-column DataFrame: an `index` column that holds the labels and a `value` column that holds the data.
s = pl.DataFrame(dict(
index=["a", "b", "c", "d", "e"],
value=np.random.randn(5)
))
s
shape: (5, 2)
index | value |
---|---|
str | f64 |
"a" | -1.163544 |
"b" | -0.900663 |
"c" | 1.657318 |
"d" | -1.240945 |
"e" | 0.964103 |
s['index']
shape: (5,)
index |
---|
str |
"a" |
"b" |
"c" |
"d" |
"e" |
pl.Series(np.random.randn(5))
shape: (5,)
f64 |
0.083327 |
-1.077013 |
0.120624 |
-0.480749 |
-1.196976 |
# pd.Series(d)
d = {"b": 1, "a": 0, "c": 2}
pl.DataFrame(list(d.items()), schema=['index', 'value'], orient='row')
shape: (3, 2)
index | value |
---|---|
str | i64 |
"b" | 1 |
"a" | 0 |
"c" | 2 |
# pd.Series(5.0, index=["a", "b", "c", "d", "e"])
# The scalar 5.0 is repeated for every label in the `index` Series.
pl.select(
index=pl.Series(["a", "b", "c", "d", "e"]),
value=5.0
)
shape: (5, 2)
index | value |
---|---|
str | f64 |
"a" | 5.0 |
"b" | 5.0 |
"c" | 5.0 |
"d" | 5.0 |
"e" | 5.0 |
Series is ndarray-like#
s['value'][0]
-1.163544306576043
s['value'][:3]
shape: (3,)
value |
---|
f64 |
-1.163544 |
-0.900663 |
1.657318 |
# s[s > s.median()]
s.filter(pl.col('value') > pl.col('value').median())
shape: (2, 2)
index | value |
---|---|
str | f64 |
"c" | 1.657318 |
"e" | 0.964103 |
# s.iloc[[4, 3, 1]]
s[[4, 3, 1]]
shape: (3, 2)
index | value |
---|---|
str | f64 |
"e" | 0.964103 |
"d" | -1.240945 |
"b" | -0.900663 |
# np.exp(s)
# exp is applied to `value` only; with_columns keeps `index` next to the result.
s.with_columns(
pl.col('value').exp()
)
shape: (5, 2)
index | value |
---|---|
str | f64 |
"a" | 0.312377 |
"b" | 0.4063 |
"c" | 5.245225 |
"d" | 0.289111 |
"e" | 2.622434 |
s['value'].dtype
Float64
# s.array
s['value'].to_numpy()
array([-1.16354431, -0.90066273, 1.65731812, -1.24094468, 0.96410308])
Series is dict-like#
# s["a"]
s.select(pl.col('value').filter(pl.col('index') == 'a')).item()
-1.163544306576043
# s["e"] = 12.0
# Overwrite `value` only on the row labelled "e"; all other rows keep their
# current value. The result is aliased explicitly so that it replaces the
# existing `value` column instead of relying on the output name being
# inferred from an expression that references both `index` and `value`.
s = s.with_columns(
    pl.when(pl.col('index') == 'e')
    .then(12.0)
    .otherwise(pl.col('value'))
    .alias('value')
)
# "e" in s
"e" in s['index']
True
"f" in s['index']
False
Vectorized operations and label alignment with Series#
# s + s
from helper.polars import align_op
align_op(s, s, op=pl.Expr.add)
shape: (5, 2)
index | value |
---|---|
str | f64 |
"a" | -2.327089 |
"b" | -1.801325 |
"c" | 3.314636 |
"d" | -2.481889 |
"e" | 24.0 |
# s * 2
s.select(
'index',
pl.col('value') * 2
)
shape: (5, 2)
index | value |
---|---|
str | f64 |
"a" | -2.327089 |
"b" | -1.801325 |
"c" | 3.314636 |
"d" | -2.481889 |
"e" | 24.0 |
# np.exp(s)
s.select(
"index",
pl.col("value").exp()
)
shape: (5, 2)
index | value |
---|---|
str | f64 |
"a" | 0.312377 |
"b" | 0.4063 |
"c" | 5.245225 |
"d" | 0.289111 |
"e" | 162754.791419 |
# s.iloc[1:] + s.iloc[:-1]
align_op(
s.slice(1),
s.slice(0, len(s) - 1),
op=pl.Expr.add,
fill_value=None,
how="full")
shape: (5, 2)
index | value |
---|---|
str | f64 |
"a" | null |
"b" | -1.801325 |
"c" | 3.314636 |
"d" | -2.481889 |
"e" | null |
Name attribute#
s = pl.Series("something", np.random.randn(5))
s
shape: (5,)
something |
---|
f64 |
1.879844 |
1.553987 |
-1.190783 |
-1.452195 |
-0.553582 |
s.name
'something'
s2 = s.rename('different')
s2.name
'different'
DataFrame#
s1 = pl.DataFrame(dict(index=["a", "b", "c"], one=[1.0, 2.0, 3.0]))
s2 = pl.DataFrame(dict(index=["a", "b", "c", "d"], two=[1.0, 2.0, 3.0, 4.0]))
df = s1.join(s2, on='index', how='full', coalesce=True)
df
shape: (4, 3)
index | one | two |
---|---|---|
str | f64 | f64 |
"a" | 1.0 | 1.0 |
"b" | 2.0 | 2.0 |
"c" | 3.0 | 3.0 |
"d" | null | 4.0 |
df['index']
shape: (4,)
index |
---|
str |
"a" |
"b" |
"c" |
"d" |
df.columns
['index', 'one', 'two']
df.drop('index').columns
['one', 'two']
From dict of ndarrays / lists#
d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
pl.DataFrame(d)
shape: (4, 2)
one | two |
---|---|
f64 | f64 |
1.0 | 4.0 |
2.0 | 3.0 |
3.0 | 2.0 |
4.0 | 1.0 |
pl.DataFrame(d).insert_column(0, pl.Series('index', ["a", "b", "c", "d"]))
shape: (4, 3)
index | one | two |
---|---|---|
str | f64 | f64 |
"a" | 1.0 | 4.0 |
"b" | 2.0 | 3.0 |
"c" | 3.0 | 2.0 |
"d" | 4.0 | 1.0 |
From structured or record array#
# Structured array with an int32 field, a float32 field and a 10-byte
# fixed-width bytes field. "S10" is the canonical spelling of that bytes
# dtype; the "a10" alias is deprecated as of NumPy 2.0.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
pl.DataFrame(data)
shape: (2, 3)
A | B | C |
---|---|---|
i32 | f32 | binary |
1 | 2.0 | b"Hello" |
2 | 3.0 | b"World" |
pl.DataFrame(data).insert_column(0, pl.Series('index', ['first', 'second']))
shape: (2, 4)
index | A | B | C |
---|---|---|---|
str | i32 | f32 | binary |
"first" | 1 | 2.0 | b"Hello" |
"second" | 2 | 3.0 | b"World" |
pl.DataFrame(data).select("C", "A", "B")
shape: (2, 3)
C | A | B |
---|---|---|
binary | i32 | f32 |
b"Hello" | 1 | 2.0 |
b"World" | 2 | 3.0 |
From a list of dicts#
data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}]
pl.DataFrame(data2)
shape: (2, 3)
a | b | c |
---|---|---|
i64 | i64 | i64 |
1 | 2 | null |
5 | 10 | 20 |
pl.DataFrame(data2).insert_column(0, pl.Series('index', ['first', 'second']))
shape: (2, 4)
index | a | b | c |
---|---|---|---|
str | i64 | i64 | i64 |
"first" | 1 | 2 | null |
"second" | 5 | 10 | 20 |
pl.DataFrame(data2).select('a', 'b')
shape: (2, 2)
a | b |
---|---|
i64 | i64 |
1 | 2 |
5 | 10 |
From a dict of tuples#
data = {
("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
}
# Build one small frame per outer key. Each inner (level0, level1) tuple
# becomes a row of [index0, index1, cell], and the value column is named
# after the outer key joined with "-". The frames are then aligned on
# their shared columns into one wide table.
dfs = [
    pl.DataFrame(
        [[*inner_key, cell] for inner_key, cell in column.items()],
        orient='row',
        schema=['index0', 'index1', '-'.join(outer_key)],
    )
    for outer_key, column in data.items()
]
pl.concat(dfs, how="align")
shape: (3, 7)
index0 | index1 | a-b | a-a | a-c | b-a | b-b |
---|---|---|---|---|---|---|
str | str | i64 | i64 | i64 | i64 | i64 |
"A" | "B" | 1 | 4 | 5 | 8 | 10 |
"A" | "C" | 2 | 3 | 6 | 7 | null |
"A" | "D" | null | null | null | null | 9 |
From a Series#
ser = pl.Series('ser', range(3))
pl.DataFrame(ser).insert_column(0, pl.Series('index', list("abc")))
shape: (3, 2)
index | ser |
---|---|
str | i64 |
"a" | 0 |
"b" | 1 |
"c" | 2 |
From a list of namedtuples#
from collections import namedtuple
Point = namedtuple("Point", "x y")
pl.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])
shape: (3, 2)
x | y |
---|---|
i64 | i64 |
0 | 0 |
0 | 3 |
2 | 3 |
Point3D = namedtuple("Point3D", "x y z")
data = [Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]
pl.DataFrame([p._asdict() for p in data])
shape: (3, 3)
x | y | z |
---|---|---|
i64 | i64 | i64 |
0 | 0 | 0 |
0 | 3 | 5 |
2 | 3 | null |
From a list of dataclasses#
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
pl.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
shape: (3, 2)
x | y |
---|---|
i64 | i64 |
0 | 0 |
0 | 3 |
2 | 3 |
Alternate constructors#
DataFrame.from_dict#
pl.DataFrame(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]))
shape: (3, 2)
A | B |
---|---|
i64 | i64 |
1 | 4 |
2 | 5 |
3 | 6 |
# Column label -> list of values; insertion order ("A", then "B") is kept.
data = {"A": [1, 2, 3], "B": [4, 5, 6]}
pl.DataFrame(
list(data.values()), schema=['one', 'two', 'three'], orient='row'
).insert_column(0, pl.Series('index', data.keys()))
shape: (2, 4)
index | one | two | three |
---|---|---|---|
str | i64 | i64 | i64 |
"A" | 1 | 2 | 3 |
"B" | 4 | 5 | 6 |
DataFrame.from_records#
# Record-style input: int32, float32 and a 10-byte fixed-width bytes field.
# "S10" replaces the "a10" alias, which is deprecated as of NumPy 2.0.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
pl.DataFrame(data)
shape: (2, 3)
A | B | C |
---|---|---|
i32 | f32 | binary |
1 | 2.0 | b"Hello" |
2 | 3.0 | b"World" |
Column selection, addition, deletion#
df['one']
shape: (4,)
one |
---|
f64 |
1.0 |
2.0 |
3.0 |
null |
df = df.with_columns(
three=pl.col('one') * pl.col('two'),
flag=pl.col('one') > 2
)
# del df["two"]
df = df.drop('two')
#three = df.pop("three")
three = df['three']
df = df.drop('three')
df
shape: (4, 3)
index | one | flag |
---|---|---|
str | f64 | bool |
"a" | 1.0 | false |
"b" | 2.0 | false |
"c" | 3.0 | true |
"d" | null | null |
df = df.with_columns(foo=pl.lit('bar'))
df
shape: (4, 4)
index | one | flag | foo |
---|---|---|---|
str | f64 | bool | str |
"a" | 1.0 | false | "bar" |
"b" | 2.0 | false | "bar" |
"c" | 3.0 | true | "bar" |
"d" | null | null | "bar" |
# df["one_trunc"] = df["one"][:2]
df = df.with_columns(
one_trunc=pl.col('one').slice(0, 2).append(pl.repeat(None, pl.len() - 2))
)
df
shape: (4, 5)
index | one | flag | foo | one_trunc |
---|---|---|---|---|
str | f64 | bool | str | f64 |
"a" | 1.0 | false | "bar" | 1.0 |
"b" | 2.0 | false | "bar" | 2.0 |
"c" | 3.0 | true | "bar" | null |
"d" | null | null | "bar" | null |
# df.insert(1, "bar", df["one"])
# insert_column modifies df in place, so the result is not reassigned;
# the copy of "one" is renamed to "bar" and placed at position 1.
df.insert_column(1, df["one"].rename('bar'))
df
shape: (4, 6)
index | bar | one | flag | foo | one_trunc |
---|---|---|---|---|---|
str | f64 | f64 | bool | str | f64 |
"a" | 1.0 | 1.0 | false | "bar" | 1.0 |
"b" | 2.0 | 2.0 | false | "bar" | 2.0 |
"c" | 3.0 | 3.0 | true | "bar" | null |
"d" | null | null | null | "bar" | null |
Assigning new columns in method chains#
iris = pl.read_csv('data/iris.data')
iris.head()
shape: (5, 5)
SepalLength | SepalWidth | PetalLength | PetalWidth | Name |
---|---|---|---|---|
f64 | f64 | f64 | f64 | str |
5.1 | 3.5 | 1.4 | 0.2 | "Iris-setosa" |
4.9 | 3.0 | 1.4 | 0.2 | "Iris-setosa" |
4.7 | 3.2 | 1.3 | 0.2 | "Iris-setosa" |
4.6 | 3.1 | 1.5 | 0.2 | "Iris-setosa" |
5.0 | 3.6 | 1.4 | 0.2 | "Iris-setosa" |
iris.with_columns(
sepal_ratio=pl.col('SepalWidth') / pl.col('SepalLength')
).head()
shape: (5, 6)
SepalLength | SepalWidth | PetalLength | PetalWidth | Name | sepal_ratio |
---|---|---|---|---|---|
f64 | f64 | f64 | f64 | str | f64 |
5.1 | 3.5 | 1.4 | 0.2 | "Iris-setosa" | 0.686275 |
4.9 | 3.0 | 1.4 | 0.2 | "Iris-setosa" | 0.612245 |
4.7 | 3.2 | 1.3 | 0.2 | "Iris-setosa" | 0.680851 |
4.6 | 3.1 | 1.5 | 0.2 | "Iris-setosa" | 0.673913 |
5.0 | 3.6 | 1.4 | 0.2 | "Iris-setosa" | 0.72 |
import hvplot.polars
(
iris
.filter(pl.col('SepalLength') > 5)
.with_columns(
SepalRatio=pl.col('SepalWidth') / pl.col('SepalLength'),
PetalRatio=pl.col('PetalWidth') / pl.col('PetalLength')
)
.hvplot.scatter(x='SepalRatio', y='PetalRatio')
)
dfa = pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
(
dfa
.with_columns(C = pl.col('A') + pl.col('B'))
.with_columns(D = pl.col('A') + pl.col('C'))
)
shape: (3, 4)
A | B | C | D |
---|---|---|---|
i64 | i64 | i64 | i64 |
1 | 4 | 5 | 6 |
2 | 5 | 7 | 9 |
3 | 6 | 9 | 12 |
Indexing / selection#
# df.loc["b"]
df.filter(pl.col('index') == 'b')
shape: (1, 6)
index | bar | one | flag | foo | one_trunc |
---|---|---|---|---|---|
str | f64 | f64 | bool | str | f64 |
"b" | 2.0 | 2.0 | false | "bar" | 2.0 |
# df.iloc[2]
df.slice(2, 1)
shape: (1, 6)
index | bar | one | flag | foo | one_trunc |
---|---|---|---|---|---|
str | f64 | f64 | bool | str | f64 |
"c" | 3.0 | 3.0 | true | "bar" | null |
Data alignment and arithmetic#
# df + df2
df = pl.DataFrame(np.random.randn(10, 4), schema=["A", "B", "C", "D"])
df2 = pl.DataFrame(np.random.randn(7, 3), schema=["A", "B", "C"])
align_op(df.with_row_index(), df2.with_row_index(), pl.Expr.add, fill_value=None)
shape: (10, 5)
index | A | B | C | D |
---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 |
0 | 0.779298 | 0.901811 | 0.091126 | 1.41107 |
1 | -0.936034 | -2.621484 | 0.237825 | 1.146001 |
2 | -1.192351 | -3.360748 | 1.563503 | 0.1362 |
3 | 0.933616 | -0.613897 | -1.318911 | 0.193137 |
4 | -0.195308 | -0.5624 | 0.118895 | 0.222491 |
5 | 0.876072 | -0.813732 | 1.367814 | 0.554935 |
6 | 0.73751 | 0.994505 | 1.104228 | 0.4458 |
7 | null | null | null | 0.959163 |
8 | null | null | null | 1.144944 |
9 | null | null | null | 0.158198 |
# df - df.iloc[0]
df.select(pl.all() - pl.all().first())
shape: (10, 4)
A | B | C | D |
---|---|---|---|
f64 | f64 | f64 | f64 |
0.0 | 0.0 | 0.0 | 0.0 |
-0.755136 | -2.481002 | 0.449509 | -0.265068 |
-0.660488 | -3.696891 | 1.680688 | -1.27487 |
1.186271 | -2.438415 | -0.240415 | -1.217932 |
-0.508953 | -1.89029 | 1.344603 | -1.188579 |
-0.631357 | -2.716706 | 0.657985 | -0.856135 |
0.117999 | -1.390857 | 0.727739 | -0.96527 |
0.549259 | -1.176227 | 2.202544 | -0.451907 |
0.772859 | -1.250499 | 2.459803 | -0.266126 |
-0.366154 | -0.029816 | 1.599533 | -1.252872 |
# df * 5 + 2
# Operator form, mirroring pandas. Its value is not rendered below because
# it is not the last expression of the cell.
df * 5 + 2
# Expression form of the same computation; this is the result shown below.
df.select(pl.all() * 5 + 2) # or this
shape: (10, 4)
A | B | C | D |
---|---|---|---|
f64 | f64 | f64 | f64 |
0.646731 | 10.257392 | -3.479233 | 9.055349 |
-3.128947 | -2.147616 | -1.231688 | 7.730007 |
-2.65571 | -8.227061 | 4.924206 | 2.681001 |
6.578087 | -1.934685 | -4.681306 | 2.965687 |
-1.898036 | 0.80594 | 3.243782 | 3.112453 |
-2.510055 | -3.32614 | -0.189309 | 4.774675 |
1.236725 | 3.303107 | 0.159463 | 4.228999 |
3.393025 | 4.376257 | 7.533488 | 6.795814 |
4.511027 | 4.004899 | 8.819779 | 7.724719 |
-1.184039 | 10.108311 | 4.518431 | 2.790988 |
df.select((1 / pl.all()).name.keep())
shape: (10, 4)
A | B | C | D |
---|---|---|---|
f64 | f64 | f64 | f64 |
-3.694757 | 0.605518 | -0.912536 | 0.708682 |
-0.974859 | -1.205512 | -1.547179 | 0.872599 |
-1.07395 | -0.488899 | 1.709866 | 7.342137 |
1.092159 | -1.27075 | -0.748357 | 5.177661 |
-1.282697 | -4.187395 | 4.019998 | 4.494571 |
-1.108634 | -0.938766 | -2.283826 | 1.802013 |
-6.550721 | 3.836984 | -2.716598 | 2.243159 |
3.58931 | 2.104149 | 0.903589 | 1.042576 |
1.991217 | 2.493891 | 0.733162 | 0.873405 |
-1.570332 | 0.616651 | 1.985363 | 6.321207 |
df.select(pl.all() ** 4)
shape: (10, 4)
A | B | C | D |
---|---|---|---|
f64 | f64 | f64 | f64 |
0.005366 | 7.438608 | 1.442113 | 3.964551 |
1.107218 | 0.473494 | 0.174517 | 1.724807 |
0.751733 | 17.503438 | 0.116991 | 0.000344 |
0.702839 | 0.383495 | 3.188347 | 0.001391 |
0.369406 | 0.003253 | 0.003829 | 0.00245 |
0.661984 | 1.287568 | 0.036758 | 0.094835 |
0.000543 | 0.004614 | 0.018361 | 0.039497 |
0.006025 | 0.051015 | 1.500085 | 0.846387 |
0.06361 | 0.025852 | 3.460997 | 1.718449 |
0.16445 | 6.915785 | 0.064364 | 0.000626 |
df1 = pl.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}).cast(pl.Boolean)
df2 = pl.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}).cast(pl.Boolean)
# df1 & df2
align_op(df1.with_row_index(), df2.with_row_index(), pl.Expr.and_)
shape: (3, 3)
index | a | b |
---|---|---|
u32 | bool | bool |
0 | false | false |
1 | false | true |
2 | true | false |
# df1 | df2
align_op(df1.with_row_index(), df2.with_row_index(), pl.Expr.or_)
shape: (3, 3)
index | a | b |
---|---|---|
u32 | bool | bool |
0 | true | true |
1 | true | true |
2 | true | true |
# df1 ^ df2
align_op(df1.with_row_index(), df2.with_row_index(), pl.Expr.xor)
shape: (3, 3)
index | a | b |
---|---|---|
u32 | bool | bool |
0 | true | true |
1 | true | false |
2 | false | true |
# -df1
df1.select(pl.all().not_())
shape: (3, 2)
a | b |
---|---|
bool | bool |
false | true |
true | false |
false | false |
Transposing#
df.slice(0, 5).transpose(include_header=True, header_name='index')
shape: (4, 6)
index | column_0 | column_1 | column_2 | column_3 | column_4 |
---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 |
"A" | -0.270654 | -1.025789 | -0.931142 | 0.915617 | -0.779607 |
"B" | 1.651478 | -0.829523 | -2.045412 | -0.786937 | -0.238812 |
"C" | -1.095847 | -0.646338 | 0.584841 | -1.336261 | 0.248756 |
"D" | 1.41107 | 1.146001 | 0.1362 | 0.193137 | 0.222491 |
DataFrame interoperability with NumPy functions#
np.exp(df)
array([[0.76288056, 5.2146839 , 0.33425647, 4.10033964],
[0.35851332, 0.43625722, 0.52396123, 3.14558954],
[0.39410336, 0.12932687, 1.79470589, 1.14591119],
[2.49831716, 0.45523708, 0.26282647, 1.21304945],
[0.4585861 , 0.78756298, 1.28242953, 1.24918415],
[0.4057529 , 0.34464929, 0.64541498, 1.74182763],
[0.85842587, 1.29773619, 0.69204278, 1.56173886],
[1.32128542, 1.60841852, 3.02435428, 2.60951072],
[1.65236152, 1.49328708, 3.91163667, 3.14226478],
[0.52897833, 5.06149624, 1.65480988, 1.17139769]])
np.asarray(df)
array([[-0.27065379, 1.65147847, -1.0958467 , 1.41106981],
[-1.02578947, -0.82952326, -0.64633759, 1.14600133],
[-0.93114207, -2.04541218, 0.58484116, 0.13620012],
[ 0.91561737, -0.78693694, -1.33626128, 0.19313739],
[-0.77960722, -0.23881194, 0.24875635, 0.22249066],
[-0.90201093, -1.06522792, -0.4378618 , 0.55493493],
[-0.15265495, 0.26062136, -0.3681075 , 0.44579985],
[ 0.27860506, 0.47525141, 1.10669761, 0.95916274],
[ 0.50220549, 0.40097979, 1.36395587, 1.14494381],
[-0.63680781, 1.62166214, 0.50368613, 0.15819764]])
ser = pl.Series([1, 2, 3, 4])
np.exp(ser)
shape: (4,)
f64 |
2.718282 |
7.389056 |
20.085537 |
54.59815 |
ser1 = pl.DataFrame(dict(value=[1, 2, 3], index=["a", "b", "c"]))
ser2 = pl.DataFrame(dict(value=[1, 3, 5], index=["b", "a", "c"]))
row(ser1, ser2)
shape: (3, 2)
|
shape: (3, 2)
|
# np.remainder(ser1, ser2)
ser1_a, ser2_a = pl.align_frames(ser1, ser2, on='index')
row(ser1_a, ser2_a, pl.select(
index=ser1_a['index'],
value=np.remainder(ser1_a['value'], ser2_a['value']))
)
shape: (3, 2)
|
shape: (3, 2)
|
shape: (3, 2)
|
Console display#
baseball = pl.read_csv('data/baseball.csv')
print(baseball)
shape: (100, 23)
┌───────┬───────────┬──────┬───────┬───┬─────┬─────┬─────┬──────┐
│ id ┆ player ┆ year ┆ stint ┆ … ┆ hbp ┆ sh ┆ sf ┆ gidp │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 ┆ i64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═══════╪═══════════╪══════╪═══════╪═══╪═════╪═════╪═════╪══════╡
│ 88641 ┆ womacto01 ┆ 2006 ┆ 2 ┆ … ┆ 0.0 ┆ 3.0 ┆ 0.0 ┆ 0.0 │
│ 88643 ┆ schilcu01 ┆ 2006 ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │
│ 88645 ┆ myersmi01 ┆ 2006 ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │
│ 88649 ┆ helliri01 ┆ 2006 ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │
│ 88650 ┆ johnsra05 ┆ 2006 ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ 89525 ┆ benitar01 ┆ 2007 ┆ 2 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │
│ 89526 ┆ benitar01 ┆ 2007 ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │
│ 89530 ┆ ausmubr01 ┆ 2007 ┆ 1 ┆ … ┆ 6.0 ┆ 4.0 ┆ 1.0 ┆ 11.0 │
│ 89533 ┆ aloumo01 ┆ 2007 ┆ 1 ┆ … ┆ 2.0 ┆ 0.0 ┆ 3.0 ┆ 13.0 │
│ 89534 ┆ alomasa02 ┆ 2007 ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │
└───────┴───────────┴──────┴───────┴───┴─────┴─────┴─────┴──────┘
baseball.glimpse()
Rows: 100
Columns: 23
$ id <i64> 88641, 88643, 88645, 88649, 88650, 88652, 88653, 88662, 89177, 89178
$ player <str> 'womacto01', 'schilcu01', 'myersmi01', 'helliri01', 'johnsra05', 'finlest01', 'gonzalu01', 'seleaa01', 'francju01', 'francju01'
$ year <i64> 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007
$ stint <i64> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1
$ team <str> 'CHN', 'BOS', 'NYA', 'MIL', 'NYA', 'SFN', 'ARI', 'LAN', 'ATL', 'NYN'
$ lg <str> 'NL', 'AL', 'AL', 'NL', 'AL', 'NL', 'NL', 'NL', 'NL', 'NL'
$ g <i64> 19, 31, 62, 20, 33, 139, 153, 28, 15, 40
$ ab <i64> 50, 2, 0, 3, 6, 426, 586, 26, 40, 50
$ r <i64> 6, 0, 0, 0, 0, 66, 93, 2, 1, 7
$ h <i64> 14, 1, 0, 0, 1, 105, 159, 5, 10, 10
$ X2b <i64> 1, 0, 0, 0, 0, 21, 52, 1, 3, 0
$ X3b <i64> 0, 0, 0, 0, 0, 12, 2, 0, 0, 0
$ hr <i64> 1, 0, 0, 0, 0, 6, 15, 0, 0, 1
$ rbi <f64> 2.0, 0.0, 0.0, 0.0, 0.0, 40.0, 73.0, 0.0, 8.0, 8.0
$ sb <f64> 1.0, 0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 0.0, 0.0, 2.0
$ cs <f64> 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0
$ bb <i64> 4, 0, 0, 0, 0, 46, 69, 1, 4, 10
$ so <f64> 4.0, 1.0, 0.0, 2.0, 4.0, 55.0, 58.0, 7.0, 10.0, 13.0
$ ibb <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 10.0, 0.0, 1.0, 0.0
$ hbp <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 7.0, 0.0, 0.0, 0.0
$ sh <f64> 3.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 6.0, 0.0, 0.0
$ sf <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 6.0, 0.0, 1.0, 1.0
$ gidp <f64> 0.0, 0.0, 0.0, 0.0, 0.0, 6.0, 14.0, 1.0, 1.0, 1.0
# baseball.iloc[-20:, :12]
# pl.nth(range(0, 12)) picks the first 12 columns by position and
# .tail(20) keeps the last 20 rows of each.
print(
baseball.select(pl.nth(range(0, 12)).tail(20))
)
shape: (20, 12)
┌───────┬───────────┬──────┬───────┬───┬─────┬─────┬─────┬─────┐
│ id ┆ player ┆ year ┆ stint ┆ … ┆ r ┆ h ┆ X2b ┆ X3b │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 ┆ i64 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═══════╪═══════════╪══════╪═══════╪═══╪═════╪═════╪═════╪═════╡
│ 89474 ┆ finlest01 ┆ 2007 ┆ 1 ┆ … ┆ 9 ┆ 17 ┆ 3 ┆ 0 │
│ 89480 ┆ embreal01 ┆ 2007 ┆ 1 ┆ … ┆ 0 ┆ 0 ┆ 0 ┆ 0 │
│ 89481 ┆ edmonji01 ┆ 2007 ┆ 1 ┆ … ┆ 39 ┆ 92 ┆ 15 ┆ 2 │
│ 89482 ┆ easleda01 ┆ 2007 ┆ 1 ┆ … ┆ 24 ┆ 54 ┆ 6 ┆ 0 │
│ 89489 ┆ delgaca01 ┆ 2007 ┆ 1 ┆ … ┆ 71 ┆ 139 ┆ 30 ┆ 0 │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ 89525 ┆ benitar01 ┆ 2007 ┆ 2 ┆ … ┆ 0 ┆ 0 ┆ 0 ┆ 0 │
│ 89526 ┆ benitar01 ┆ 2007 ┆ 1 ┆ … ┆ 0 ┆ 0 ┆ 0 ┆ 0 │
│ 89530 ┆ ausmubr01 ┆ 2007 ┆ 1 ┆ … ┆ 38 ┆ 82 ┆ 16 ┆ 3 │
│ 89533 ┆ aloumo01 ┆ 2007 ┆ 1 ┆ … ┆ 51 ┆ 112 ┆ 19 ┆ 1 │
│ 89534 ┆ alomasa02 ┆ 2007 ┆ 1 ┆ … ┆ 1 ┆ 3 ┆ 1 ┆ 0 │
└───────┴───────────┴──────┴───────┴───┴─────┴─────┴─────┴─────┘
print(pl.DataFrame(np.random.randn(3, 12)))
shape: (3, 12)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ column_0 ┆ column_1 ┆ column_2 ┆ column_3 ┆ … ┆ column_8 ┆ column_9 ┆ column_10 ┆ column_11 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.545788 ┆ -0.786944 ┆ 0.511419 ┆ -1.465452 ┆ … ┆ 0.103465 ┆ 1.25846 ┆ -1.961481 ┆ -0.892518 │
│ 0.828004 ┆ 0.291711 ┆ -0.666151 ┆ -0.056065 ┆ … ┆ -1.487244 ┆ -0.325992 ┆ -0.25713 ┆ -0.381324 │
│ 0.44002 ┆ 0.21451 ┆ 0.285626 ┆ -0.689011 ┆ … ┆ -0.06698 ┆ 1.482008 ┆ 1.288873 ┆ -1.12619 │
└──────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘
# Print a 12-column frame with the table width capped at 40 characters.
# The option is passed to the Config constructor and applies inside this
# `with` block.
with pl.Config(tbl_width_chars=40):
    print(pl.DataFrame(np.random.randn(3, 12)))
shape: (3, 12)
┌─────┬─────┬─────┬─────┬───┬─────┬─────┬─────┬─────┐
│ col ┆ col ┆ col ┆ col ┆ … ┆ col ┆ col ┆ col ┆ col │
│ umn ┆ umn ┆ umn ┆ umn ┆ ┆ umn ┆ umn ┆ umn ┆ umn │
│ _0 ┆ _1 ┆ _2 ┆ _3 ┆ ┆ _8 ┆ _9 ┆ _10 ┆ _11 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═════╪═════╪═════╪═══╪═════╪═════╪═════╪═════╡
│ -1. ┆ 0.0 ┆ -0. ┆ 1.7 ┆ … ┆ 0.8 ┆ 0.3 ┆ -1. ┆ -1. │
│ 593 ┆ 321 ┆ 382 ┆ 495 ┆ ┆ 431 ┆ 008 ┆ 529 ┆ 861 │
│ 329 ┆ 95 ┆ 463 ┆ 8 ┆ ┆ 73 ┆ 16 ┆ 093 ┆ 916 │
│ 0.5 ┆ -0. ┆ 0.5 ┆ -0. ┆ … ┆ -0. ┆ -0. ┆ -1. ┆ 0.6 │
│ 488 ┆ 337 ┆ 436 ┆ 509 ┆ ┆ 462 ┆ 292 ┆ 385 ┆ 538 │
│ 96 ┆ 439 ┆ 99 ┆ 815 ┆ ┆ 243 ┆ 441 ┆ 327 ┆ 87 │
│ -1. ┆ -0. ┆ 0.7 ┆ -0. ┆ … ┆ 0.1 ┆ -0. ┆ -0. ┆ 0.5 │
│ 150 ┆ 856 ┆ 171 ┆ 110 ┆ ┆ 902 ┆ 484 ┆ 245 ┆ 015 │
│ 248 ┆ 766 ┆ 96 ┆ 738 ┆ ┆ 8 ┆ 427 ┆ 482 ┆ 93 │
└─────┴─────┴─────┴─────┴───┴─────┴─────┴─────┴─────┘
datafile = {
"filename": ["filename_01", "filename_02"],
"path": [
"media/user_name/storage/folder_01/filename_01",
"media/user_name/storage/folder_02/filename_02",
],
}
# With the table width capped at 30 characters the long `path` strings are
# wrapped and truncated in the printed output.
with pl.Config(tbl_width_chars=30):
    print(pl.DataFrame(datafile))
shape: (2, 2)
┌─────────────┬──────────────┐
│ filename ┆ path │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪══════════════╡
│ filename_01 ┆ media/user_n │
│ ┆ ame/storage/ │
│ ┆ folder… │
│ filename_02 ┆ media/user_n │
│ ┆ ame/storage/ │
│ ┆ folder… │
└─────────────┴──────────────┘
# Same frame with the table width raised to 100 characters: each row fits
# on one line, though the long strings are still shortened with an ellipsis.
with pl.Config(tbl_width_chars=100):
    print(pl.DataFrame(datafile))
shape: (2, 2)
┌─────────────┬─────────────────────────────────┐
│ filename ┆ path │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪═════════════════════════════════╡
│ filename_01 ┆ media/user_name/storage/folder… │
│ filename_02 ┆ media/user_name/storage/folder… │
└─────────────┴─────────────────────────────────┘
DataFrame column attribute access and IPython completion#
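Polars has no equivalent of this feature: columns cannot be accessed as attributes of a DataFrame (for example `df.foo`). Select a column with `df['foo']`, or refer to it inside an expression with `pl.col('foo')`.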