Getting started tutorials
This notebook reimplements the examples from the Pandas Getting Started tutorials (https://pandas.pydata.org/docs/getting_started/) using Polars. I hope it helps you transition from Pandas to Polars more easily. Be mindful that the two libraries have different philosophies, so some tasks that are easy to accomplish in Pandas may require more effort in Polars.
from helper.jupyter import row
import polars as pl
from polars import selectors as cs
import holoviews as hv
import hvplot.polars  # importing this registers the .hvplot accessor on Polars objects
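Besides the eager API used throughout this notebook, Polars also has a lazy API that builds a query plan and optimizes it (predicate and projection pushdown, for example) before executing anything. A minimal sketch, using the data/titanic.csv file that appears later in this notebook:
# scan_csv only builds a plan; collect() runs the optimized query
lazy_names = (
    pl.scan_csv('data/titanic.csv')
    .filter(pl.col('Age') > 35)
    .select('Name', 'Age')
)
lazy_names.collect().head()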
What kind of data does pandas handle?
df = pl.DataFrame(
{
"Name": [
"Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth",
],
"Age": [22, 35, 58],
"Sex": ["male", "male", "female"],
}
)
df
shape: (3, 3)
Name | Age | Sex |
---|---|---|
str | i64 | str |
"Braund, Mr. Owen Harris" | 22 | "male" |
"Allen, Mr. William Henry" | 35 | "male" |
"Bonnell, Miss. Elizabeth" | 58 | "female" |
# df["Age"]
row(
df['Age'],
df.get_column('Age')
)
shape: (3,)
shape: (3,)
ages = pl.Series('Age', [22, 35, 58])
ages
shape: (3,)
Age |
---|
i64 |
22 |
35 |
58 |
df['Age'].max()
58
ages.max()
58
df.select(pl.col('Age').max()).item()
58
df.describe()
shape: (9, 4)
statistic | Name | Age | Sex |
---|---|---|---|
str | str | f64 | str |
"count" | "3" | 3.0 | "3" |
"null_count" | "0" | 0.0 | "0" |
"mean" | null | 38.333333 | null |
"std" | null | 18.230012 | null |
"min" | "Allen, Mr. William Henry" | 22.0 | "female" |
"25%" | null | 35.0 | null |
"50%" | null | 35.0 | null |
"75%" | null | 58.0 | null |
"max" | "Braund, Mr. Owen Harris" | 58.0 | "male" |
How do I read and write tabular data?
titanic = pl.read_csv('data/titanic.csv')
titanic.head(3)
shape: (3, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
3 | 1 | 3 | "Heikkinen, Miss Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
titanic.tail(3)
shape: (3, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
889 | 0 | 3 | "Johnston, Miss Catherine Helen… | "female" | null | 1 | 2 | "W./C. 6607" | 23.45 | null | "S" |
890 | 1 | 1 | "Behr, Mr. Karl Howell" | "male" | 26.0 | 0 | 0 | "111369" | 30.0 | "C148" | "C" |
891 | 0 | 3 | "Dooley, Mr. Patrick" | "male" | 32.0 | 0 | 0 | "370376" | 7.75 | null | "Q" |
titanic.dtypes
[Int64,
Int64,
Int64,
String,
String,
Float64,
Int64,
Int64,
String,
Float64,
String,
String]
titanic.schema
Schema([('PassengerId', Int64),
('Survived', Int64),
('Pclass', Int64),
('Name', String),
('Sex', String),
('Age', Float64),
('SibSp', Int64),
('Parch', Int64),
('Ticket', String),
('Fare', Float64),
('Cabin', String),
('Embarked', String)])
titanic.write_excel('data/titanic.xlsx', worksheet='passengers');
titanic = pl.read_excel('data/titanic.xlsx', sheet_name='passengers')
# titanic.info()
titanic.glimpse()
Rows: 891
Columns: 12
$ PassengerId <i64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ Survived <i64> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1
$ Pclass <i64> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2
$ Name <str> 'Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Heikkinen, Miss Laina', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'Allen, Mr. William Henry', 'Moran, Mr. James', 'McCarthy, Mr. Timothy J', 'Palsson, Master Gosta Leonard', 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)', 'Nasser, Mrs. Nicholas (Adele Achem)'
$ Sex <str> 'male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'female'
$ Age <f64> 22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0
$ SibSp <i64> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1
$ Parch <i64> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0
$ Ticket <str> 'A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909', '347742', '237736'
$ Fare <f64> 7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51.8625, 21.075, 11.1333, 30.0708
$ Cabin <str> None, 'C85', None, 'C123', None, None, 'E46', None, None, None
$ Embarked <str> 'S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C'
titanic.estimated_size()
85640
titanic.null_count()
shape: (1, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
0 | 0 | 0 | 0 | 0 | 177 | 0 | 0 | 0 | 0 | 687 | 2 |
titanic.n_unique()
891
titanic.select(pl.all().n_unique())
shape: (1, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
891 | 2 | 3 | 891 | 2 | 89 | 7 | 7 | 681 | 248 | 148 | 4 |
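Exact distinct counts can get expensive on large frames. If your Polars version provides approx_n_unique, an approximate count is much cheaper:
# HyperLogLog-based estimate of the distinct count per column
titanic.select(pl.all().approx_n_unique())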
How do I select a subset of a DataFrame?
How do I select specific columns from a DataFrame?
age = titanic['Age']
age = titanic.get_column('Age') # or this
age.head(5)
shape: (5,)
Age |
---|
f64 |
22.0 |
38.0 |
26.0 |
35.0 |
35.0 |
type(age)
polars.series.series.Series
age.shape
(891,)
age_sex = titanic[['Age', 'Sex']]
age_sex = titanic.select('Age', 'Sex')  # or this
age_sex.head(5)
shape: (5, 2)
Age | Sex |
---|---|
f64 | str |
22.0 | "male" |
38.0 | "female" |
26.0 | "female" |
35.0 | "female" |
35.0 | "male" |
type(age_sex)
polars.dataframe.frame.DataFrame
age_sex.shape
(891, 2)
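The selectors module imported above as cs plays the role of Pandas' select_dtypes: it selects columns by dtype or name pattern instead of listing them. A small sketch:
# all numeric columns, without naming them explicitly
titanic.select(cs.numeric()).head(3)
# or by name pattern, e.g. Survived, Sex and SibSp:
# titanic.select(cs.starts_with('S'))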
How do I filter specific rows from a DataFrame?
# above_35 = titanic[titanic["Age"] > 35]
above_35 = titanic.filter(pl.col('Age') > 35)
above_35.head(5)
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
7 | 0 | 1 | "McCarthy, Mr. Timothy J" | "male" | 54.0 | 0 | 0 | "17463" | 51.8625 | "E46" | "S" |
12 | 1 | 1 | "Bonnell, Miss Elizabeth" | "female" | 58.0 | 0 | 0 | "113783" | 26.55 | "C103" | "S" |
14 | 0 | 3 | "Andersson, Mr. Anders Johan" | "male" | 39.0 | 1 | 5 | "347082" | 31.275 | null | "S" |
16 | 1 | 2 | "Hewlett, Mrs. (Mary D Kingcome… | "female" | 55.0 | 0 | 0 | "248706" | 16.0 | null | "S" |
# titanic["Age"] > 35
row(
titanic['Age'] > 35,
titanic.select(pl.col('Age') > 35),
titanic.select(pl.col('Age') > 35).to_series()
)
shape: (891,)
shape: (891, 1)
shape: (891,)
above_35.shape
(217, 12)
# class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23 = titanic.filter(pl.col('Pclass').is_in([2, 3]))
class_23.head(5)
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
3 | 1 | 3 | "Heikkinen, Miss Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
6 | 0 | 3 | "Moran, Mr. James" | "male" | null | 0 | 0 | "330877" | 8.4583 | null | "Q" |
8 | 0 | 3 | "Palsson, Master Gosta Leonard" | "male" | 2.0 | 3 | 1 | "349909" | 21.075 | null | "S" |
# class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
class_23 = titanic.filter(
(pl.col('Pclass') == 2) | (pl.col('Pclass') == 3)
)
class_23 = titanic.filter(
pl.col('Pclass').eq(2).or_(pl.col('Pclass').eq(3))
)
# age_no_na = titanic[titanic["Age"].notna()]
age_no_na = titanic.filter(pl.col('Age').is_not_null())
age_no_na.head()
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
3 | 1 | 3 | "Heikkinen, Miss Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
age_no_na.shape
(714, 12)
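This particular filter also has a shorthand: drop_nulls with a subset should give the same result:
# equivalent to filtering on is_not_null for the Age column
age_no_na = titanic.drop_nulls(subset='Age')
age_no_na.shape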
How do I select specific rows and columns from a DataFrame?
# adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
adult_names = (
titanic
.filter(pl.col('Age') > 35)
.select('Name')
.to_series()
)
adult_names.head(5)
shape: (5,)
Name |
---|
str |
"Cumings, Mrs. John Bradley (Fl… |
"McCarthy, Mr. Timothy J" |
"Bonnell, Miss Elizabeth" |
"Andersson, Mr. Anders Johan" |
"Hewlett, Mrs. (Mary D Kingcome… |
# titanic.iloc[9:25, 2:5]
titanic.select(pl.nth(range(2, 5)).slice(9, 25 - 9))
shape: (16, 3)
Pclass | Name | Sex |
---|---|---|
i64 | str | str |
2 | "Nasser, Mrs. Nicholas (Adele A… | "female" |
3 | "Sandstrom, Miss Marguerite Rut" | "female" |
1 | "Bonnell, Miss Elizabeth" | "female" |
3 | "Saundercock, Mr. William Henry" | "male" |
3 | "Andersson, Mr. Anders Johan" | "male" |
… | … | … |
2 | "Fynney, Mr. Joseph J" | "male" |
2 | "Beesley, Mr. Lawrence" | "male" |
3 | "McGowan, Miss Anna "Annie"" | "female" |
1 | "Sloper, Mr. William Thompson" | "male" |
3 | "Palsson, Miss Torborg Danira" | "female" |
# titanic.iloc[0:3, 3] = "anonymous"
titanic = titanic.with_columns(
pl.when(pl.int_range(pl.len()) < 3)
.then(pl.lit('anonymous'))
.otherwise(pl.nth(3))
.name.keep()
)
titanic.head(5)
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
1 | 0 | 3 | "anonymous" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
2 | 1 | 1 | "anonymous" | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
3 | 1 | 3 | "anonymous" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
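Positional references like pl.nth(3) break as soon as the column order changes; naming the column explicitly is a more robust equivalent. A sketch:
# the same conditional overwrite, referring to the Name column by name
titanic.with_columns(
    pl.when(pl.int_range(pl.len()) < 3)
    .then(pl.lit('anonymous'))
    .otherwise(pl.col('Name'))
    .alias('Name')
)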
How do I create plots in pandas?
air_quality = pl.read_csv("data/air_quality_no2.csv", try_parse_dates=True)
air_quality.head(5)
shape: (5, 4)
datetime | station_antwerp | station_paris | station_london |
---|---|---|---|
datetime[μs] | f64 | f64 | f64 |
2019-05-07 02:00:00 | null | null | 23.0 |
2019-05-07 03:00:00 | 50.5 | 25.0 | 19.0 |
2019-05-07 04:00:00 | 45.0 | 27.7 | 19.0 |
2019-05-07 05:00:00 | null | 50.4 | 16.0 |
2019-05-07 06:00:00 | null | 61.9 | null |
air_quality.hvplot.line(x='datetime')
air_quality.hvplot.line(x='datetime', y='station_paris')
air_quality.hvplot.scatter(x="station_london", y="station_paris", alpha=0.5)
[
method_name
for method_name in dir(air_quality.hvplot)
if not method_name.startswith("_")
]
['area',
'bar',
'barh',
'bivariate',
'box',
'dataset',
'density',
'errorbars',
'explorer',
'heatmap',
'hexbin',
'hist',
'kde',
'labels',
'line',
'ohlc',
'paths',
'points',
'polygons',
'scatter',
'step',
'table',
'vectorfield',
'violin']
air_quality.hvplot.box()
# air_quality.plot.area(figsize=(12, 4), subplots=True)
(
air_quality
.unpivot(index="datetime", variable_name="station", value_name="quality")
.hvplot.area(x='datetime', y='quality', groupby="station")
.opts(height=180)
.layout()
.cols(1)
)
fig = (
air_quality
.unpivot(index="datetime", variable_name="station", value_name="quality")
.with_columns(
pl.col('quality').fill_null(0)
)
.hvplot.area(x='datetime', y='quality', groupby='station')
.redim(quality=r"NO2 concentration")
.opts(alpha=0.8)
.overlay()
)
fig
hv.save(fig, 'data/no2_concentrations.html')
How to create new columns derived from existing columns
air_quality = pl.read_csv("data/air_quality_no2.csv", try_parse_dates=True)
air_quality.head(5)
shape: (5, 4)
datetime | station_antwerp | station_paris | station_london |
---|---|---|---|
datetime[μs] | f64 | f64 | f64 |
2019-05-07 02:00:00 | null | null | 23.0 |
2019-05-07 03:00:00 | 50.5 | 25.0 | 19.0 |
2019-05-07 04:00:00 | 45.0 | 27.7 | 19.0 |
2019-05-07 05:00:00 | null | 50.4 | 16.0 |
2019-05-07 06:00:00 | null | 61.9 | null |
# air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.88
air_quality = air_quality.with_columns(
london_mg_per_cubic = pl.col("station_london") * 1.882
)
# air_quality["ratio_paris_antwerp"] = (
# air_quality["station_paris"] / air_quality["station_antwerp"]
# )
air_quality = air_quality.with_columns(
ratio_paris_antwerp = pl.col("station_paris") / pl.col("station_antwerp")
)
air_quality_renamed = air_quality.rename(
{
"station_antwerp": "BETR801",
"station_paris": "FR04014",
"station_london": "London Westminster",
}
)
air_quality_renamed.head(5)
shape: (5, 6)
datetime | BETR801 | FR04014 | London Westminster | london_mg_per_cubic | ratio_paris_antwerp |
---|---|---|---|---|---|
datetime[μs] | f64 | f64 | f64 | f64 | f64 |
2019-05-07 02:00:00 | null | null | 23.0 | 43.286 | null |
2019-05-07 03:00:00 | 50.5 | 25.0 | 19.0 | 35.758 | 0.49505 |
2019-05-07 04:00:00 | 45.0 | 27.7 | 19.0 | 35.758 | 0.615556 |
2019-05-07 05:00:00 | null | 50.4 | 16.0 | 30.112 | null |
2019-05-07 06:00:00 | null | 61.9 | null | null | null |
air_quality_renamed = air_quality_renamed.rename(str.lower)
air_quality_renamed.head(5)
shape: (5, 6)
datetime | betr801 | fr04014 | london westminster | london_mg_per_cubic | ratio_paris_antwerp |
---|---|---|---|---|---|
datetime[μs] | f64 | f64 | f64 | f64 | f64 |
2019-05-07 02:00:00 | null | null | 23.0 | 43.286 | null |
2019-05-07 03:00:00 | 50.5 | 25.0 | 19.0 | 35.758 | 0.49505 |
2019-05-07 04:00:00 | 45.0 | 27.7 | 19.0 | 35.758 | 0.615556 |
2019-05-07 05:00:00 | null | 50.4 | 16.0 | 30.112 | null |
2019-05-07 06:00:00 | null | 61.9 | null | null | null |
How to calculate summary statistics
Aggregating statistics
titanic = pl.read_csv("data/titanic.csv")
(
titanic['Age'].mean(),
titanic.select(pl.col('Age').mean()).item()  # or this
)
(29.69911764705882, 29.69911764705882)
res1 = titanic[["Age", "Fare"]].median()
res2 = titanic.select(pl.col('Age', 'Fare').median())
row(res1, res2)
shape: (1, 2)
shape: (1, 2)
res1.transpose(include_header=True, column_names=['Value'])
shape: (2, 2)
column | Value |
---|---|
str | f64 |
"Age" | 28.0 |
"Fare" | 14.4542 |
titanic[["Age", "Fare"]].describe()
shape: (9, 3)
statistic | Age | Fare |
---|---|---|
str | f64 | f64 |
"count" | 714.0 | 891.0 |
"null_count" | 177.0 | 0.0 |
"mean" | 29.699118 | 32.204208 |
"std" | 14.526497 | 49.693429 |
"min" | 0.42 | 0.0 |
"25%" | 20.0 | 7.925 |
"50%" | 28.0 | 14.4542 |
"75%" | 38.0 | 31.0 |
"max" | 80.0 | 512.3292 |
# titanic.agg(
# {
# "Age": ["min", "max", "median", "skew"],
# "Fare": ["min", "max", "median", "mean"],
# }
# )
age, fare = [pl.col(name) for name in ['Age', 'Fare']]
(
titanic
.select(
Age=pl.struct(
min=age.min(),
max=age.max(),
median=age.median(),
skew=age.skew()
),
Fare=pl.struct(
min=fare.min(),
max=fare.max(),
median=fare.median(),
mean=fare.mean()
)
)
.transpose(include_header=True, column_names=['agg'])
.unnest('agg')
)
shape: (2, 6)
column | min | max | median | skew | mean |
---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 |
"Age" | 0.42 | 80.0 | 28.0 | 0.38829 | null |
"Fare" | 0.0 | 512.3292 | 14.4542 | null | 32.204208 |
def agg(df, items):
    # build one struct column per input column, holding its requested aggregates
    exprs = {
        key: pl.struct(**{name: getattr(pl.col(key), name)() for name in agg_names})
        for key, agg_names in items.items()
    }
    # transpose so columns become rows, then unnest each struct of aggregates
    return df.select(**exprs).transpose(include_header=True, column_names=["agg"]).unnest("agg")
agg(
titanic,
{
"Age": ["min", "max", "median", "skew"],
"Fare": ["min", "max", "median", "mean"],
},
)
shape: (2, 6)
column | min | max | median | skew | mean |
---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 |
"Age" | 0.42 | 80.0 | 28.0 | 0.38829 | null |
"Fare" | 0.0 | 512.3292 | 14.4542 | null | 32.204208 |
Aggregating statistics grouped by category
# titanic[["Sex", "Age"]].groupby("Sex").mean()
titanic.group_by('Sex').agg(pl.col('Age').mean())
shape: (2, 2)
Sex | Age |
---|---|
str | f64 |
"female" | 27.915709 |
"male" | 30.726645 |
# titanic.groupby("Sex").mean(numeric_only=True)
from polars import selectors as cs
titanic.group_by('Sex').agg(cs.numeric().mean())
shape: (2, 8)
Sex | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare |
---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"female" | 431.028662 | 0.742038 | 2.159236 | 27.915709 | 0.694268 | 0.649682 | 44.479818 |
"male" | 454.147314 | 0.188908 | 2.389948 | 30.726645 | 0.429809 | 0.235702 | 25.523893 |
# titanic.groupby(["Sex", "Pclass"])["Fare"].mean()
(
titanic
.group_by('Sex', 'Pclass')
.agg(pl.col('Fare').mean())
.sort('Sex', 'Pclass')
)
shape: (6, 3)
Sex | Pclass | Fare |
---|---|---|
str | i64 | f64 |
"female" | 1 | 106.125798 |
"female" | 2 | 21.970121 |
"female" | 3 | 16.11881 |
"male" | 1 | 67.226127 |
"male" | 2 | 19.741782 |
"male" | 3 | 12.661633 |
Count number of records by category
# titanic["Pclass"].value_counts()
res1 = titanic['Pclass'].value_counts().sort('count', descending=True)
res2 = (
titanic
.select(
pl.col('Pclass')
.value_counts()
.struct.unnest()
)
.sort('count', descending=True)
)
row(res1, res2)
shape: (3, 2)
shape: (3, 2)
# titanic.groupby("Pclass")["Pclass"].count()
titanic.group_by('Pclass').agg(count=pl.len())
shape: (3, 2)
Pclass | count |
---|---|
i64 | u32 |
3 | 491 |
2 | 184 |
1 | 216 |
How to reshape the layout of tables
titanic = pl.read_csv("data/titanic.csv")
air_quality = pl.read_csv("data/air_quality_long.csv", try_parse_dates=True)
Sort table rows
titanic.sort('Age', nulls_last=True).head(5)
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
804 | 1 | 3 | "Thomas, Master Assad Alexander" | "male" | 0.42 | 0 | 1 | "2625" | 8.5167 | null | "C" |
756 | 1 | 2 | "Hamalainen, Master Viljo" | "male" | 0.67 | 1 | 1 | "250649" | 14.5 | null | "S" |
470 | 1 | 3 | "Baclini, Miss Helene Barbara" | "female" | 0.75 | 2 | 1 | "2666" | 19.2583 | null | "C" |
645 | 1 | 3 | "Baclini, Miss Eugenie" | "female" | 0.75 | 2 | 1 | "2666" | 19.2583 | null | "C" |
79 | 1 | 2 | "Caldwell, Master Alden Gates" | "male" | 0.83 | 0 | 2 | "248738" | 29.0 | null | "S" |
titanic.sort('Pclass', 'Age', descending=True, nulls_last=True).head()
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
852 | 0 | 3 | "Svensson, Mr. Johan" | "male" | 74.0 | 0 | 0 | "347060" | 7.775 | null | "S" |
117 | 0 | 3 | "Connors, Mr. Patrick" | "male" | 70.5 | 0 | 0 | "370369" | 7.75 | null | "Q" |
281 | 0 | 3 | "Duane, Mr. Frank" | "male" | 65.0 | 0 | 0 | "336439" | 7.75 | null | "Q" |
484 | 1 | 3 | "Turkula, Mrs. (Hedwig)" | "female" | 63.0 | 0 | 0 | "4134" | 9.5875 | null | "S" |
327 | 0 | 3 | "Nysveen, Mr. Johan Hansen" | "male" | 61.0 | 0 | 0 | "345364" | 6.2375 | null | "S" |
Long to wide table format
no2 = air_quality.filter(pl.col('parameter') == 'no2')
# no2_subset = no2.sort_index().groupby(["location"]).head(2)
no2_subset = (
no2
.sort('date.utc')
.group_by('location')
.agg(pl.all().head(2))
.explode(pl.exclude('location'))
)
no2_subset
shape: (6, 7)
location | city | country | date.utc | parameter | value | unit |
---|---|---|---|---|---|---|
str | str | str | datetime[μs, UTC] | str | f64 | str |
"BETR801" | "Antwerpen" | "BE" | 2019-04-09 01:00:00 UTC | "no2" | 22.5 | "µg/m³" |
"BETR801" | "Antwerpen" | "BE" | 2019-04-09 02:00:00 UTC | "no2" | 53.5 | "µg/m³" |
"FR04014" | "Paris" | "FR" | 2019-04-09 01:00:00 UTC | "no2" | 24.4 | "µg/m³" |
"FR04014" | "Paris" | "FR" | 2019-04-09 02:00:00 UTC | "no2" | 27.4 | "µg/m³" |
"London Westminster" | "London" | "GB" | 2019-04-09 02:00:00 UTC | "no2" | 67.0 | "µg/m³" |
"London Westminster" | "London" | "GB" | 2019-04-09 03:00:00 UTC | "no2" | 67.0 | "µg/m³" |
no2_subset.pivot(on="location", index='date.utc', values='value')
shape: (3, 4)
date.utc | BETR801 | FR04014 | London Westminster |
---|---|---|---|
datetime[μs, UTC] | f64 | f64 | f64 |
2019-04-09 01:00:00 UTC | 22.5 | 24.4 | null |
2019-04-09 02:00:00 UTC | 53.5 | 27.4 | 67.0 |
2019-04-09 03:00:00 UTC | null | null | 67.0 |
no2.head(5)
shape: (5, 7)
city | country | date.utc | location | parameter | value | unit |
---|---|---|---|---|---|---|
str | str | datetime[μs, UTC] | str | str | f64 | str |
"Paris" | "FR" | 2019-06-21 00:00:00 UTC | "FR04014" | "no2" | 20.0 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 23:00:00 UTC | "FR04014" | "no2" | 21.8 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 22:00:00 UTC | "FR04014" | "no2" | 26.5 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 21:00:00 UTC | "FR04014" | "no2" | 24.9 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 20:00:00 UTC | "FR04014" | "no2" | 21.4 | "µg/m³" |
(
no2
.sort('date.utc')
.pivot(on='location', index='date.utc', values='value')
.hvplot.line(x="date.utc")
)
Pivot table
air_quality.pivot(
on='parameter', index='location', values='value', aggregate_function='mean'
)
shape: (3, 3)
location | pm25 | no2 |
---|---|---|
str | f64 | f64 |
"BETR801" | 23.169492 | 26.95092 |
"London Westminster" | 13.443568 | 29.74005 |
"FR04014" | null | 29.374284 |
# air_quality.pivot_table(
# values="value",
# index="location",
# columns="parameter",
# aggfunc="mean",
# margins=True,
# )
def pivot_with_margins(df, on, index, expr):
    all_on = pl.lit('All').alias(on)
    all_index = pl.lit('All').alias(index)
    return (
        pl.concat([
            df.group_by(index, on).agg(expr),      # cell values
            df.group_by(index).agg(all_on, expr),  # row margins (the 'All' column)
            df.group_by(on).agg(all_index, expr),  # column margins (the 'All' row)
            df.select(all_on, all_index, expr),    # grand total
        ], how='diagonal')
        .pivot(on=on, index=index)
    )
pivot_with_margins(
air_quality,
on='parameter',
index='location',
expr=pl.col('value').mean()
)
shape: (4, 4)
location | no2 | pm25 | All |
---|---|---|---|
str | f64 | f64 | f64 |
"FR04014" | 29.374284 | null | 29.374284 |
"BETR801" | 26.95092 | 23.169492 | 24.982353 |
"London Westminster" | 29.74005 | 13.443568 | 21.491708 |
"All" | 29.430316 | 14.386849 | 24.222743 |
Wide to long format
no2_pivoted = no2.pivot(on='location', index="date.utc", values="value").sort('date.utc')
no2_pivoted.head(5)
shape: (5, 4)
date.utc | FR04014 | BETR801 | London Westminster |
---|---|---|---|
datetime[μs, UTC] | f64 | f64 | f64 |
2019-04-09 01:00:00 UTC | 24.4 | 22.5 | null |
2019-04-09 02:00:00 UTC | 27.4 | 53.5 | 67.0 |
2019-04-09 03:00:00 UTC | 34.2 | 54.5 | 67.0 |
2019-04-09 04:00:00 UTC | 48.5 | 34.5 | 41.0 |
2019-04-09 05:00:00 UTC | 59.5 | 46.5 | 41.0 |
# no_2 = no2_pivoted.melt(id_vars="date.utc")
no_2 = no2_pivoted.unpivot(index='date.utc', variable_name='location')
no_2.head(5)
shape: (5, 3)
date.utc | location | value |
---|---|---|
datetime[μs, UTC] | str | f64 |
2019-04-09 01:00:00 UTC | "FR04014" | 24.4 |
2019-04-09 02:00:00 UTC | "FR04014" | 27.4 |
2019-04-09 03:00:00 UTC | "FR04014" | 34.2 |
2019-04-09 04:00:00 UTC | "FR04014" | 48.5 |
2019-04-09 05:00:00 UTC | "FR04014" | 59.5 |
no_2 = no2_pivoted.unpivot(
on=["BETR801", "FR04014", "London Westminster"],
index='date.utc',
variable_name='id_location',
value_name='NO_2'
)
no_2.head(5)
shape: (5, 3)
date.utc | id_location | NO_2 |
---|---|---|
datetime[μs, UTC] | str | f64 |
2019-04-09 01:00:00 UTC | "BETR801" | 22.5 |
2019-04-09 02:00:00 UTC | "BETR801" | 53.5 |
2019-04-09 03:00:00 UTC | "BETR801" | 54.5 |
2019-04-09 04:00:00 UTC | "BETR801" | 34.5 |
2019-04-09 05:00:00 UTC | "BETR801" | 46.5 |
How to combine data from multiple tables
air_quality_no2 = pl.read_csv("data/air_quality_no2_long.csv", try_parse_dates=True)
air_quality_no2 = air_quality_no2.select("date.utc", "location", "parameter", "value")
air_quality_pm25 = pl.read_csv("data/air_quality_pm25_long.csv", try_parse_dates=True)
air_quality_pm25 = air_quality_pm25.select("date.utc", "location", "parameter", "value")
Concatenating objects
air_quality = pl.concat([air_quality_pm25, air_quality_no2], how='vertical')
air_quality
shape: (3_178, 4)
date.utc | location | parameter | value |
---|---|---|---|
datetime[μs, UTC] | str | str | f64 |
2019-06-18 06:00:00 UTC | "BETR801" | "pm25" | 18.0 |
2019-06-17 08:00:00 UTC | "BETR801" | "pm25" | 6.5 |
2019-06-17 07:00:00 UTC | "BETR801" | "pm25" | 18.5 |
2019-06-17 06:00:00 UTC | "BETR801" | "pm25" | 16.0 |
2019-06-17 05:00:00 UTC | "BETR801" | "pm25" | 7.5 |
… | … | … | … |
2019-05-07 06:00:00 UTC | "London Westminster" | "no2" | 26.0 |
2019-05-07 04:00:00 UTC | "London Westminster" | "no2" | 16.0 |
2019-05-07 03:00:00 UTC | "London Westminster" | "no2" | 19.0 |
2019-05-07 02:00:00 UTC | "London Westminster" | "no2" | 19.0 |
2019-05-07 01:00:00 UTC | "London Westminster" | "no2" | 23.0 |
print('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25.shape)
print('Shape of the ``air_quality_no2`` table: ', air_quality_no2.shape)
print('Shape of the resulting ``air_quality`` table: ', air_quality.shape)
Shape of the ``air_quality_pm25`` table: (1110, 4)
Shape of the ``air_quality_no2`` table: (2068, 4)
Shape of the resulting ``air_quality`` table: (3178, 4)
air_quality = air_quality.sort("date.utc")
air_quality.head()
shape: (5, 4)
date.utc | location | parameter | value |
---|---|---|---|
datetime[μs, UTC] | str | str | f64 |
2019-05-07 01:00:00 UTC | "BETR801" | "pm25" | 12.5 |
2019-05-07 01:00:00 UTC | "London Westminster" | "pm25" | 8.0 |
2019-05-07 01:00:00 UTC | "FR04014" | "no2" | 25.0 |
2019-05-07 01:00:00 UTC | "BETR801" | "no2" | 50.5 |
2019-05-07 01:00:00 UTC | "London Westminster" | "no2" | 23.0 |
# air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"])
air_quality_ = pl.concat([
air_quality_pm25.with_columns(pl.lit('PM25').alias('key')),
air_quality_no2.with_columns(pl.lit('NO2').alias('key'))
]).select('key', pl.exclude('key'))
air_quality_
shape: (3_178, 5)
key | date.utc | location | parameter | value |
---|---|---|---|---|
str | datetime[μs, UTC] | str | str | f64 |
"PM25" | 2019-06-18 06:00:00 UTC | "BETR801" | "pm25" | 18.0 |
"PM25" | 2019-06-17 08:00:00 UTC | "BETR801" | "pm25" | 6.5 |
"PM25" | 2019-06-17 07:00:00 UTC | "BETR801" | "pm25" | 18.5 |
"PM25" | 2019-06-17 06:00:00 UTC | "BETR801" | "pm25" | 16.0 |
"PM25" | 2019-06-17 05:00:00 UTC | "BETR801" | "pm25" | 7.5 |
… | … | … | … | … |
"NO2" | 2019-05-07 06:00:00 UTC | "London Westminster" | "no2" | 26.0 |
"NO2" | 2019-05-07 04:00:00 UTC | "London Westminster" | "no2" | 16.0 |
"NO2" | 2019-05-07 03:00:00 UTC | "London Westminster" | "no2" | 19.0 |
"NO2" | 2019-05-07 02:00:00 UTC | "London Westminster" | "no2" | 19.0 |
"NO2" | 2019-05-07 01:00:00 UTC | "London Westminster" | "no2" | 23.0 |
def concat_with_keys(dfs, keys, key_column_name='key'):
    # tag each frame with its key, then stack them and put the key column first
    dfs = [df.with_columns(pl.lit(key).alias(key_column_name))
           for df, key in zip(dfs, keys)]
    return pl.concat(dfs, how='vertical').select(key_column_name, pl.exclude(key_column_name))
air_quality_ = concat_with_keys([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"])
air_quality_
shape: (3_178, 5)
key | date.utc | location | parameter | value |
---|---|---|---|---|
str | datetime[μs, UTC] | str | str | f64 |
"PM25" | 2019-06-18 06:00:00 UTC | "BETR801" | "pm25" | 18.0 |
"PM25" | 2019-06-17 08:00:00 UTC | "BETR801" | "pm25" | 6.5 |
"PM25" | 2019-06-17 07:00:00 UTC | "BETR801" | "pm25" | 18.5 |
"PM25" | 2019-06-17 06:00:00 UTC | "BETR801" | "pm25" | 16.0 |
"PM25" | 2019-06-17 05:00:00 UTC | "BETR801" | "pm25" | 7.5 |
… | … | … | … | … |
"NO2" | 2019-05-07 06:00:00 UTC | "London Westminster" | "no2" | 26.0 |
"NO2" | 2019-05-07 04:00:00 UTC | "London Westminster" | "no2" | 16.0 |
"NO2" | 2019-05-07 03:00:00 UTC | "London Westminster" | "no2" | 19.0 |
"NO2" | 2019-05-07 02:00:00 UTC | "London Westminster" | "no2" | 19.0 |
"NO2" | 2019-05-07 01:00:00 UTC | "London Westminster" | "no2" | 23.0 |
Join tables using a common identifier
stations_coord = pl.read_csv("data/air_quality_stations.csv")
stations_coord.head(5)
shape: (5, 3)
location | coordinates.latitude | coordinates.longitude |
---|---|---|
str | f64 | f64 |
"BELAL01" | 51.23619 | 4.38522 |
"BELHB23" | 51.1703 | 4.341 |
"BELLD01" | 51.10998 | 5.00486 |
"BELLD02" | 51.12038 | 5.02155 |
"BELR833" | 51.32766 | 4.36226 |
# air_quality = pd.merge(air_quality, stations_coord, how="left", on="location")
air_quality = air_quality.join(stations_coord, on='location', how='left')
air_quality.head(5)
shape: (5, 6)
date.utc | location | parameter | value | coordinates.latitude | coordinates.longitude |
---|---|---|---|---|---|
datetime[μs, UTC] | str | str | f64 | f64 | f64 |
2019-05-07 01:00:00 UTC | "BETR801" | "pm25" | 12.5 | 51.20966 | 4.43182 |
2019-05-07 01:00:00 UTC | "London Westminster" | "pm25" | 8.0 | 51.49467 | -0.13193 |
2019-05-07 01:00:00 UTC | "FR04014" | "no2" | 25.0 | 48.83724 | 2.3939 |
2019-05-07 01:00:00 UTC | "FR04014" | "no2" | 25.0 | 48.83722 | 2.3939 |
2019-05-07 01:00:00 UTC | "BETR801" | "no2" | 50.5 | 51.20966 | 4.43182 |
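An anti join is a quick way to check which rows would not find a match in the other table; it keeps only the unmatched rows of the left frame. A sketch:
# rows in air_quality whose location has no entry in stations_coord
air_quality.join(stations_coord, on='location', how='anti')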
air_quality_parameters = pl.read_csv("data/air_quality_parameters.csv")
air_quality_parameters.head(5)
shape: (5, 3)
id | description | name |
---|---|---|
str | str | str |
"bc" | "Black Carbon" | "BC" |
"co" | "Carbon Monoxide" | "CO" |
"no2" | "Nitrogen Dioxide" | "NO2" |
"o3" | "Ozone" | "O3" |
"pm10" | "Particulate matter less than 1… | "PM10" |
# air_quality = pd.merge(air_quality, air_quality_parameters, how="left", left_on="parameter", right_on="id")
air_quality = air_quality.join(air_quality_parameters, left_on='parameter', right_on='id', how='left')
air_quality.head()
shape: (5, 8)
date.utc | location | parameter | value | coordinates.latitude | coordinates.longitude | description | name |
---|---|---|---|---|---|---|---|
datetime[μs, UTC] | str | str | f64 | f64 | f64 | str | str |
2019-05-07 01:00:00 UTC | "BETR801" | "pm25" | 12.5 | 51.20966 | 4.43182 | "Particulate matter less than 2… | "PM2.5" |
2019-05-07 01:00:00 UTC | "London Westminster" | "pm25" | 8.0 | 51.49467 | -0.13193 | "Particulate matter less than 2… | "PM2.5" |
2019-05-07 01:00:00 UTC | "FR04014" | "no2" | 25.0 | 48.83724 | 2.3939 | "Nitrogen Dioxide" | "NO2" |
2019-05-07 01:00:00 UTC | "FR04014" | "no2" | 25.0 | 48.83722 | 2.3939 | "Nitrogen Dioxide" | "NO2" |
2019-05-07 01:00:00 UTC | "BETR801" | "no2" | 50.5 | 51.20966 | 4.43182 | "Nitrogen Dioxide" | "NO2" |
How to handle time series data with ease
air_quality = pl.read_csv("data/air_quality_no2_long.csv")
air_quality = air_quality.rename({"date.utc": "datetime"})
air_quality.head()
shape: (5, 7)
city | country | datetime | location | parameter | value | unit |
---|---|---|---|---|---|---|
str | str | str | str | str | f64 | str |
"Paris" | "FR" | "2019-06-21 00:00:00+00:00" | "FR04014" | "no2" | 20.0 | "µg/m³" |
"Paris" | "FR" | "2019-06-20 23:00:00+00:00" | "FR04014" | "no2" | 21.8 | "µg/m³" |
"Paris" | "FR" | "2019-06-20 22:00:00+00:00" | "FR04014" | "no2" | 26.5 | "µg/m³" |
"Paris" | "FR" | "2019-06-20 21:00:00+00:00" | "FR04014" | "no2" | 24.9 | "µg/m³" |
"Paris" | "FR" | "2019-06-20 20:00:00+00:00" | "FR04014" | "no2" | 21.4 | "µg/m³" |
row(
air_quality['city'].unique(),
air_quality.select(pl.col('city').unique())
)
shape: (3,)
shape: (3, 1)
Using pandas datetime properties
# air_quality["datetime"] = pd.to_datetime(air_quality["datetime"])
air_quality = air_quality.with_columns(
pl.col('datetime').str.to_datetime()
)
air_quality['datetime']
shape: (2_068,)
datetime |
---|
datetime[μs, UTC] |
2019-06-21 00:00:00 UTC |
2019-06-20 23:00:00 UTC |
2019-06-20 22:00:00 UTC |
2019-06-20 21:00:00 UTC |
2019-06-20 20:00:00 UTC |
… |
2019-05-07 06:00:00 UTC |
2019-05-07 04:00:00 UTC |
2019-05-07 03:00:00 UTC |
2019-05-07 02:00:00 UTC |
2019-05-07 01:00:00 UTC |
# air_quality["datetime"].min(), air_quality["datetime"].max()
air_quality.select(
min=pl.col('datetime').min(),
max=pl.col('datetime').max()
).row(0)
(datetime.datetime(2019, 5, 7, 1, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')),
datetime.datetime(2019, 6, 21, 0, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')))
# air_quality["datetime"].max() - air_quality["datetime"].min()
air_quality.select(
span=pl.col('datetime').max() - pl.col('datetime').min()
).item()
datetime.timedelta(days=44, seconds=82800)
# air_quality["month"] = air_quality["datetime"].dt.month
air_quality.with_columns(
month=pl.col('datetime').dt.month()
)
air_quality.head(5)
shape: (5, 7)
city | country | datetime | location | parameter | value | unit |
---|---|---|---|---|---|---|
str | str | datetime[μs, UTC] | str | str | f64 | str |
"Paris" | "FR" | 2019-06-21 00:00:00 UTC | "FR04014" | "no2" | 20.0 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 23:00:00 UTC | "FR04014" | "no2" | 21.8 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 22:00:00 UTC | "FR04014" | "no2" | 26.5 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 21:00:00 UTC | "FR04014" | "no2" | 24.9 | "µg/m³" |
"Paris" | "FR" | 2019-06-20 20:00:00 UTC | "FR04014" | "no2" | 21.4 | "µg/m³" |
# air_quality.groupby(
# [air_quality["datetime"].dt.weekday, "location"])["value"].mean()
(
air_quality.group_by(
pl.col('datetime').dt.weekday().alias('weekday'), 'location'
)
.agg(
pl.col('value').mean()
)
.sort('weekday', 'location')
)
shape: (21, 3)
weekday | location | value |
---|---|---|
i8 | str | f64 |
1 | "BETR801" | 27.875 |
1 | "FR04014" | 24.85625 |
1 | "London Westminster" | 23.969697 |
2 | "BETR801" | 22.214286 |
2 | "FR04014" | 30.999359 |
… | … | … |
6 | "FR04014" | 25.266154 |
6 | "London Westminster" | 24.977612 |
7 | "BETR801" | 21.896552 |
7 | "FR04014" | 23.274306 |
7 | "London Westminster" | 24.859155 |
(
air_quality
.group_by(
pl.col('datetime').dt.hour().alias('hour')
)
.agg(
pl.col('value').mean()
)
.sort('hour')
.hvplot.bar(x='hour', y='value')
)
Datetime as index
no_2 = air_quality.pivot(on='location', index='datetime', values='value').sort('datetime')
no_2.head(5)
shape: (5, 4)
datetime | FR04014 | BETR801 | London Westminster |
---|---|---|---|
datetime[μs, UTC] | f64 | f64 | f64 |
2019-05-07 01:00:00 UTC | 25.0 | 50.5 | 23.0 |
2019-05-07 02:00:00 UTC | 27.7 | 45.0 | 19.0 |
2019-05-07 03:00:00 UTC | 50.4 | null | 19.0 |
2019-05-07 04:00:00 UTC | 61.9 | null | 16.0 |
2019-05-07 05:00:00 UTC | 72.4 | null | null |
# no_2.index.year, no_2.index.weekday
row(
no_2['datetime'].dt.year(),
no_2['datetime'].dt.weekday()
)
shape: (1_033,)
shape: (1_033,)
no_2.filter(
pl.col('datetime').dt.date().is_between(
pl.date(2019, 5, 20),
pl.date(2019, 5, 21)
)
).hvplot.line(x='datetime')
Resample a time series to another frequency
# monthly_max = no_2.resample("ME").max()
monthly_max = (
no_2.group_by(
pl.col('datetime').dt.date().dt.month_end()
)
.agg(
pl.all().max()
)
.sort('datetime')
)
monthly_max
shape: (2, 4)
datetime | FR04014 | BETR801 | London Westminster |
---|---|---|---|
date | f64 | f64 | f64 |
2019-05-31 | 97.0 | 74.5 | 97.0 |
2019-06-30 | 84.7 | 52.5 | 52.0 |
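The closer analogue of Pandas' resample is group_by_dynamic, which buckets a sorted datetime column into fixed-width windows. A sketch; note that every='1mo' labels each bucket by its month start rather than its month end:
# windows of one calendar month over the sorted datetime column
(
    no_2
    .sort('datetime')
    .group_by_dynamic('datetime', every='1mo')
    .agg(pl.exclude('datetime').max())
)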
# no_2.resample("D").mean().plot(style="-o", figsize=(10, 5))
df = (
no_2
.group_by(
pl.col('datetime').dt.truncate('1d')
)
.agg(
pl.all().mean()
)
)
df.hvplot.line(x='datetime') * df.hvplot.scatter(x='datetime')
How to manipulate textual data
titanic = pl.read_csv("data/titanic.csv")
# titanic["Name"].str.lower()
row(
titanic['Name'].str.to_lowercase(),
titanic.select(pl.col('Name').str.to_lowercase()).to_series()
)
shape: (891,)
shape: (891,)
# titanic["Name"].str.split(",")
row(
titanic['Name'].str.split(","),
titanic.select(pl.col('Name').str.split(",")).to_series()
)
shape: (891,)
shape: (891,)
# titanic["Surname"] = titanic["Name"].str.split(",").str.get(0)
titanic = titanic.with_columns(
Surname=pl.col('Name').str.split(",").list.get(0)
)
titanic['Surname']
shape: (891,)
Surname |
---|
str |
"Braund" |
"Cumings" |
"Heikkinen" |
"Futrelle" |
"Allen" |
… |
"Montvila" |
"Graham" |
"Johnston" |
"Behr" |
"Dooley" |
# titanic["Name"].str.contains("Countess")
row(
titanic["Name"].str.contains("Countess"),
titanic.select(pl.col('Name').str.contains("Countess")).to_series()
)
shape: (891,)
shape: (891,)
# titanic[titanic["Name"].str.contains("Countess")]
titanic.filter(
pl.col('Name').str.contains("Countess")
)
shape: (1, 13)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Surname |
---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str | str |
760 | 1 | 1 | "Rothes, the Countess. of (Lucy… | "female" | 33.0 | 0 | 0 | "110152" | 86.5 | "B77" | "S" | "Rothes" |
# titanic["Name"].str.len()
row(
titanic["Name"].str.len_chars(),
titanic.select(pl.col('Name').str.len_chars()).to_series()
)
shape: (891,)
shape: (891,)
# titanic["Name"].str.len().idxmax()
titanic.select(pl.col('Name').str.len_chars().arg_max()).item()
307
# titanic.loc[titanic["Name"].str.len().idxmax(), "Name"]
titanic.row(
titanic.select(pl.col('Name').str.len_chars().arg_max()).item(), named=True
)['Name']
'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'
titanic.select(
pl.col('Name').get(pl.col('Name').str.len_chars().arg_max())
).item()
'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'
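The same row can also be found without computing an index first, by filtering where the length equals its maximum (assuming the maximum is unique):
# keep the row whose name length equals the maximum name length
titanic.filter(
    pl.col('Name').str.len_chars() == pl.col('Name').str.len_chars().max()
).select('Name').item()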
# titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", "female": "F"})
titanic = titanic.with_columns(
Sex_short=pl.col('Sex').replace({"male": "M", "female": "F"})
)
titanic['Sex_short']
shape: (891,)
Sex_short |
---|
str |
"M" |
"F" |
"F" |
"F" |
"M" |
… |
"M" |
"F" |
"F" |
"M" |
"M" |