Getting started tutorials#

This notebook contains code that reimplements the examples from the Pandas Getting Started documentation using Polars: https://pandas.pydata.org/docs/getting_started/. I hope this notebook helps you transition from Pandas to Polars more easily. Be mindful that Polars and Pandas have different philosophies, so there are some tasks that are easy to accomplish in Pandas but may require more effort in Polars.

from helper.jupyter import row
import polars as pl
from polars import selectors as cs
import holoviews as hv
import hvplot.polars # need to import this to enable the .hvplot accessor on Polars objects

What kind of data does pandas handle?#

df = pl.DataFrame(
    {
        "Name": [
            "Braund, Mr. Owen Harris",
            "Allen, Mr. William Henry",
            "Bonnell, Miss. Elizabeth",
        ],
        "Age": [22, 35, 58],
        "Sex": ["male", "male", "female"],
    }
)
df
shape: (3, 3)
NameAgeSex
stri64str
"Braund, Mr. Owen Harris"22"male"
"Allen, Mr. William Henry"35"male"
"Bonnell, Miss. Elizabeth"58"female"
# df["Age"]
row(
    df['Age'],
    df.get_column('Age')
)
shape: (3,)
Age
i64
22
35
58
shape: (3,)
Age
i64
22
35
58
ages = pl.Series('Age', [22, 35, 58])
ages
shape: (3,)
Age
i64
22
35
58
df['Age'].max()
58
ages.max()
58
df.select(pl.col('Age').max()).item()
58
df.describe()
shape: (9, 4)
statisticNameAgeSex
strstrf64str
"count""3"3.0"3"
"null_count""0"0.0"0"
"mean"null38.333333null
"std"null18.230012null
"min""Allen, Mr. William Henry"22.0"female"
"25%"null35.0null
"50%"null35.0null
"75%"null58.0null
"max""Braund, Mr. Owen Harris"58.0"male"

How do I read and write tabular data?#

titanic = pl.read_csv('data/titanic.csv')
titanic.head(3)
shape: (3, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
titanic.tail(3)
shape: (3, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
88903"Johnston, Miss Catherine Helen…"female"null12"W./C. 6607"23.45null"S"
89011"Behr, Mr. Karl Howell""male"26.000"111369"30.0"C148""C"
89103"Dooley, Mr. Patrick""male"32.000"370376"7.75null"Q"
titanic.dtypes
[Int64,
 Int64,
 Int64,
 String,
 String,
 Float64,
 Int64,
 Int64,
 String,
 Float64,
 String,
 String]
titanic.schema
Schema([('PassengerId', Int64),
        ('Survived', Int64),
        ('Pclass', Int64),
        ('Name', String),
        ('Sex', String),
        ('Age', Float64),
        ('SibSp', Int64),
        ('Parch', Int64),
        ('Ticket', String),
        ('Fare', Float64),
        ('Cabin', String),
        ('Embarked', String)])
titanic.write_excel('data/titanic.xlsx', worksheet='passengers');
titanic = pl.read_excel('data/titanic.xlsx', sheet_name='passengers')
# titanic.info()
titanic.glimpse()
Rows: 891
Columns: 12
$ PassengerId <i64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ Survived    <i64> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1
$ Pclass      <i64> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2
$ Name        <str> 'Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Heikkinen, Miss Laina', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'Allen, Mr. William Henry', 'Moran, Mr. James', 'McCarthy, Mr. Timothy J', 'Palsson, Master Gosta Leonard', 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)', 'Nasser, Mrs. Nicholas (Adele Achem)'
$ Sex         <str> 'male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'female'
$ Age         <f64> 22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0
$ SibSp       <i64> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1
$ Parch       <i64> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0
$ Ticket      <str> 'A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909', '347742', '237736'
$ Fare        <f64> 7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51.8625, 21.075, 11.1333, 30.0708
$ Cabin       <str> None, 'C85', None, 'C123', None, None, 'E46', None, None, None
$ Embarked    <str> 'S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C'
titanic.estimated_size()
85640
titanic.null_count()
shape: (1, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
u32u32u32u32u32u32u32u32u32u32u32u32
0000017700006872
titanic.n_unique()
891
titanic.select(pl.all().n_unique())
shape: (1, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
u32u32u32u32u32u32u32u32u32u32u32u32
89123891289776812481484

How do I select a subset of a DataFrame?#

How do I select specific columns from a DataFrame?#

age = titanic['Age']
age = titanic.get_column('Age') # or this
age.head(5)
shape: (5,)
Age
f64
22.0
38.0
26.0
35.0
35.0
type(age)
polars.series.series.Series
age.shape
(891,)
age_sex = titanic[['Age', 'Sex']]
age_sex = titanic.select('Age', 'Sex') #or this
age_sex.head(5)
shape: (5, 2)
AgeSex
f64str
22.0"male"
38.0"female"
26.0"female"
35.0"female"
35.0"male"
type(age_sex)
polars.dataframe.frame.DataFrame
age_sex.shape
(891, 2)

How do I filter specific rows from a DataFrame?#

# above_35 = titanic[titanic["Age"] > 35]
above_35 = titanic.filter(pl.col('Age') > 35)
above_35.head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
701"McCarthy, Mr. Timothy J""male"54.000"17463"51.8625"E46""S"
1211"Bonnell, Miss Elizabeth""female"58.000"113783"26.55"C103""S"
1403"Andersson, Mr. Anders Johan""male"39.015"347082"31.275null"S"
1612"Hewlett, Mrs. (Mary D Kingcome…"female"55.000"248706"16.0null"S"
# titanic["Age"] > 35
row(
    titanic['Age'] > 35,
    titanic.select(pl.col('Age') > 35),
    titanic.select(pl.col('Age') > 35).to_series()
)
shape: (891,)
Age
bool
false
true
false
false
false
false
false
null
false
false
shape: (891, 1)
Age
bool
false
true
false
false
false
false
false
null
false
false
shape: (891,)
Age
bool
false
true
false
false
false
false
false
null
false
false
above_35.shape
(217, 12)
# class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23 = titanic.filter(pl.col('Pclass').is_in([2, 3]))
class_23.head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"
603"Moran, Mr. James""male"null00"330877"8.4583null"Q"
803"Palsson, Master Gosta Leonard""male"2.031"349909"21.075null"S"
# class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
class_23 = titanic.filter(
    (pl.col('Pclass') == 2) | (pl.col('Pclass') == 3)
)

class_23 = titanic.filter(
    pl.col('Pclass').eq(2).or_(pl.col('Pclass').eq(3))
)
# age_no_na = titanic[titanic["Age"].notna()]
age_no_na = titanic.filter(pl.col('Age').is_not_null())
age_no_na.head()
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
411"Futrelle, Mrs. Jacques Heath (…"female"35.010"113803"53.1"C123""S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"
age_no_na.shape
(714, 12)

How do I select specific rows and columns from a DataFrame?#

# adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
adult_names = (
    titanic
    .filter(pl.col('Age') > 35)
    .select('Name')
    .to_series()
)
adult_names.head(5)
shape: (5,)
Name
str
"Cumings, Mrs. John Bradley (Fl…
"McCarthy, Mr. Timothy J"
"Bonnell, Miss Elizabeth"
"Andersson, Mr. Anders Johan"
"Hewlett, Mrs. (Mary D Kingcome…
# titanic.iloc[9:25, 2:5]
titanic.select(pl.nth(range(2, 5)).slice(9, 25 - 9))
shape: (16, 3)
PclassNameSex
i64strstr
2"Nasser, Mrs. Nicholas (Adele A…"female"
3"Sandstrom, Miss Marguerite Rut""female"
1"Bonnell, Miss Elizabeth""female"
3"Saundercock, Mr. William Henry""male"
3"Andersson, Mr. Anders Johan""male"
2"Fynney, Mr. Joseph J""male"
2"Beesley, Mr. Lawrence""male"
3"McGowan, Miss Anna "Annie"""female"
1"Sloper, Mr. William Thompson""male"
3"Palsson, Miss Torborg Danira""female"
# titanic.iloc[0:3, 3] = "anonymous"
titanic = titanic.with_columns(
    pl.when(pl.int_range(pl.len()) < 3)
    .then(pl.lit('anonymous'))
    .otherwise(pl.nth(3))
    .name.keep()
)
titanic.head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"anonymous""male"22.010"A/5 21171"7.25null"S"
211"anonymous""female"38.010"PC 17599"71.2833"C85""C"
313"anonymous""female"26.000"STON/O2. 3101282"7.925null"S"
411"Futrelle, Mrs. Jacques Heath (…"female"35.010"113803"53.1"C123""S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"

How do I create plots in pandas?#

air_quality = pl.read_csv("data/air_quality_no2.csv", try_parse_dates=True)
air_quality.head(5)
shape: (5, 4)
datetimestation_antwerpstation_parisstation_london
datetime[μs]f64f64f64
2019-05-07 02:00:00nullnull23.0
2019-05-07 03:00:0050.525.019.0
2019-05-07 04:00:0045.027.719.0
2019-05-07 05:00:00null50.416.0
2019-05-07 06:00:00null61.9null
air_quality.hvplot.line(x='datetime')
air_quality.hvplot.line(x='datetime', y='station_paris')
air_quality.hvplot.scatter(x="station_london", y="station_paris", alpha=0.5)
[
    method_name
    for method_name in dir(air_quality.hvplot)
    if not method_name.startswith("_")
]
['area',
 'bar',
 'barh',
 'bivariate',
 'box',
 'dataset',
 'density',
 'errorbars',
 'explorer',
 'heatmap',
 'hexbin',
 'hist',
 'kde',
 'labels',
 'line',
 'ohlc',
 'paths',
 'points',
 'polygons',
 'scatter',
 'step',
 'table',
 'vectorfield',
 'violin']
air_quality.hvplot.box()
# air_quality.plot.area(figsize=(12, 4), subplots=True)
(
air_quality
.unpivot(index="datetime", variable_name="station", value_name="quality")
.hvplot.area(x='datetime', y='quality', groupby="station")
.opts(height=180)
.layout()
.cols(1)
)
fig = (
air_quality
.unpivot(index="datetime", variable_name="station", value_name="quality")
.with_columns(
    pl.col('quality').fill_null(0)
)
.hvplot.area(x='datetime', y='quality', groupby='station')
.redim(quality=r"NO2 concentration")
.opts(alpha=0.8)
.overlay()
)
fig
hv.save(fig, 'data/no2_concentrations.html')

How to create new columns derived from existing columns#

air_quality = pl.read_csv("data/air_quality_no2.csv", try_parse_dates=True)
air_quality.head(5)
shape: (5, 4)
datetimestation_antwerpstation_parisstation_london
datetime[μs]f64f64f64
2019-05-07 02:00:00nullnull23.0
2019-05-07 03:00:0050.525.019.0
2019-05-07 04:00:0045.027.719.0
2019-05-07 05:00:00null50.416.0
2019-05-07 06:00:00null61.9null
# air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.882
air_quality = air_quality.with_columns(
    london_mg_per_cubic = pl.col("station_london") * 1.882
)
# air_quality["ratio_paris_antwerp"] = (
#    air_quality["station_paris"] / air_quality["station_antwerp"]
# )

air_quality = air_quality.with_columns(
    ratio_paris_antwerp = pl.col("station_paris") / pl.col("station_antwerp")
)
air_quality_renamed = air_quality.rename(
    {
        "station_antwerp": "BETR801",
        "station_paris": "FR04014",
        "station_london": "London Westminster",
    }
)
air_quality_renamed.head(5)
shape: (5, 6)
datetimeBETR801FR04014London Westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime[μs]f64f64f64f64f64
2019-05-07 02:00:00nullnull23.043.286null
2019-05-07 03:00:0050.525.019.035.7580.49505
2019-05-07 04:00:0045.027.719.035.7580.615556
2019-05-07 05:00:00null50.416.030.112null
2019-05-07 06:00:00null61.9nullnullnull
air_quality_renamed = air_quality_renamed.rename(str.lower)
air_quality_renamed.head(5)
shape: (5, 6)
datetimebetr801fr04014london westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime[μs]f64f64f64f64f64
2019-05-07 02:00:00nullnull23.043.286null
2019-05-07 03:00:0050.525.019.035.7580.49505
2019-05-07 04:00:0045.027.719.035.7580.615556
2019-05-07 05:00:00null50.416.030.112null
2019-05-07 06:00:00null61.9nullnullnull

How to calculate summary statistics#

Aggregating statistics#

titanic = pl.read_csv("data/titanic.csv")
(
    titanic['Age'].mean(),
    titanic.select(pl.col('Age').mean()).item() #or this
)
(29.69911764705882, 29.69911764705882)
res1 = titanic[["Age", "Fare"]].median()
res2 = titanic.select(pl.col('Age', 'Fare').median())
row(res1, res2)
shape: (1, 2)
AgeFare
f64f64
28.014.4542
shape: (1, 2)
AgeFare
f64f64
28.014.4542
res1.transpose(include_header=True, column_names=['Value'])
shape: (2, 2)
columnValue
strf64
"Age"28.0
"Fare"14.4542
titanic[["Age", "Fare"]].describe()
shape: (9, 3)
statisticAgeFare
strf64f64
"count"714.0891.0
"null_count"177.00.0
"mean"29.69911832.204208
"std"14.52649749.693429
"min"0.420.0
"25%"20.07.925
"50%"28.014.4542
"75%"38.031.0
"max"80.0512.3292
# titanic.agg(
#    {
#        "Age": ["min", "max", "median", "skew"],
#        "Fare": ["min", "max", "median", "mean"],
#    }
# )

age, fare = [pl.col(name) for name in ['Age', 'Fare']]
(
titanic
.select(
    Age=pl.struct(
        min=age.min(),
        max=age.max(),
        median=age.median(),
        skew=age.skew()
    ),
    Fare=pl.struct(
        min=fare.min(),
        max=fare.max(),
        median=fare.median(),
        mean=fare.mean()
    )
)
.transpose(include_header=True, column_names=['agg'])
.unnest('agg')
)
shape: (2, 6)
columnminmaxmedianskewmean
strf64f64f64f64f64
"Age"0.4280.028.00.38829null
"Fare"0.0512.329214.4542null32.204208
def agg(df, items):
    """Mimic pandas ``DataFrame.agg`` with a dict of column -> aggregation names.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to aggregate.
    items : dict[str, list[str]]
        Maps a column name to the names of ``pl.Expr`` aggregation methods
        (e.g. ``"min"``, ``"max"``, ``"median"``, ``"skew"``) to apply to it.

    Returns
    -------
    pl.DataFrame
        One row per requested column and one column per aggregation name,
        with null where an aggregation was not requested for that column.
    """
    # Pack each column's aggregates into a single struct so one select()
    # yields one row, then transpose so input columns become rows and
    # unnest the struct back into per-aggregation columns (pandas-like layout).
    exprs = {
        key: pl.struct(**{agg_name: getattr(pl.col(key), agg_name)() for agg_name in agg_names})
        for key, agg_names in items.items()
    }
    return df.select(**exprs).transpose(include_header=True, column_names=["agg"]).unnest("agg")


agg(
    titanic,
    {
        "Age": ["min", "max", "median", "skew"],
        "Fare": ["min", "max", "median", "mean"],
    },
)
shape: (2, 6)
columnminmaxmedianskewmean
strf64f64f64f64f64
"Age"0.4280.028.00.38829null
"Fare"0.0512.329214.4542null32.204208

Aggregating statistics grouped by category#

# titanic[["Sex", "Age"]].groupby("Sex").mean()
titanic.group_by('Sex').agg(pl.col('Age').mean())
shape: (2, 2)
SexAge
strf64
"female"27.915709
"male"30.726645
# titanic.groupby("Sex").mean(numeric_only=True)

from polars import selectors as cs
titanic.group_by('Sex').agg(cs.numeric().mean())
shape: (2, 8)
SexPassengerIdSurvivedPclassAgeSibSpParchFare
strf64f64f64f64f64f64f64
"female"431.0286620.7420382.15923627.9157090.6942680.64968244.479818
"male"454.1473140.1889082.38994830.7266450.4298090.23570225.523893
# titanic.groupby(["Sex", "Pclass"])["Fare"].mean()
(
titanic
.group_by('Sex', 'Pclass')
.agg(pl.col('Fare').mean())
.sort('Sex', 'Pclass')
)
shape: (6, 3)
SexPclassFare
stri64f64
"female"1106.125798
"female"221.970121
"female"316.11881
"male"167.226127
"male"219.741782
"male"312.661633

Count number of records by category#

# titanic["Pclass"].value_counts()
res1 = titanic['Pclass'].value_counts().sort('count', descending=True)
res2 = (
titanic
.select(
    pl.col('Pclass')
      .value_counts()
      .struct.unnest()
)
.sort('count', descending=True)
)
row(res1, res2)
shape: (3, 2)
Pclasscount
i64u32
3491
1216
2184
shape: (3, 2)
Pclasscount
i64u32
3491
1216
2184
# titanic.groupby("Pclass")["Pclass"].count()
titanic.group_by('Pclass').agg(count=pl.len())
shape: (3, 2)
Pclasscount
i64u32
3491
2184
1216

How to reshape the layout of tables#

titanic = pl.read_csv("data/titanic.csv")
air_quality = pl.read_csv("data/air_quality_long.csv", try_parse_dates=True)

Sort table rows#

titanic.sort('Age', nulls_last=True).head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
80413"Thomas, Master Assad Alexander""male"0.4201"2625"8.5167null"C"
75612"Hamalainen, Master Viljo""male"0.6711"250649"14.5null"S"
47013"Baclini, Miss Helene Barbara""female"0.7521"2666"19.2583null"C"
64513"Baclini, Miss Eugenie""female"0.7521"2666"19.2583null"C"
7912"Caldwell, Master Alden Gates""male"0.8302"248738"29.0null"S"
titanic.sort('Pclass', 'Age', descending=True, nulls_last=True).head()
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
85203"Svensson, Mr. Johan""male"74.000"347060"7.775null"S"
11703"Connors, Mr. Patrick""male"70.500"370369"7.75null"Q"
28103"Duane, Mr. Frank""male"65.000"336439"7.75null"Q"
48413"Turkula, Mrs. (Hedwig)""female"63.000"4134"9.5875null"S"
32703"Nysveen, Mr. Johan Hansen""male"61.000"345364"6.2375null"S"

Long to wide table format#

no2 = air_quality.filter(pl.col('parameter') == 'no2')
# no2_subset = no2.sort_index().groupby(["location"]).head(2)
no2_subset = (
no2
.sort('date.utc')
.group_by('location')
.agg(pl.all().head(2))
.explode(pl.exclude('location'))
)
no2_subset
shape: (6, 7)
locationcitycountrydate.utcparametervalueunit
strstrstrdatetime[μs, UTC]strf64str
"BETR801""Antwerpen""BE"2019-04-09 01:00:00 UTC"no2"22.5"µg/m³"
"BETR801""Antwerpen""BE"2019-04-09 02:00:00 UTC"no2"53.5"µg/m³"
"FR04014""Paris""FR"2019-04-09 01:00:00 UTC"no2"24.4"µg/m³"
"FR04014""Paris""FR"2019-04-09 02:00:00 UTC"no2"27.4"µg/m³"
"London Westminster""London""GB"2019-04-09 02:00:00 UTC"no2"67.0"µg/m³"
"London Westminster""London""GB"2019-04-09 03:00:00 UTC"no2"67.0"µg/m³"
no2_subset.pivot(on="location", index='date.utc', values='value')
shape: (3, 4)
date.utcBETR801FR04014London Westminster
datetime[μs, UTC]f64f64f64
2019-04-09 01:00:00 UTC22.524.4null
2019-04-09 02:00:00 UTC53.527.467.0
2019-04-09 03:00:00 UTCnullnull67.0
no2.head(5)
shape: (5, 7)
citycountrydate.utclocationparametervalueunit
strstrdatetime[μs, UTC]strstrf64str
"Paris""FR"2019-06-21 00:00:00 UTC"FR04014""no2"20.0"µg/m³"
"Paris""FR"2019-06-20 23:00:00 UTC"FR04014""no2"21.8"µg/m³"
"Paris""FR"2019-06-20 22:00:00 UTC"FR04014""no2"26.5"µg/m³"
"Paris""FR"2019-06-20 21:00:00 UTC"FR04014""no2"24.9"µg/m³"
"Paris""FR"2019-06-20 20:00:00 UTC"FR04014""no2"21.4"µg/m³"
(
no2
.sort('date.utc')
.pivot(on='location', index='date.utc', values='value')
.hvplot.line(x="date.utc")
)

Pivot table#

air_quality.pivot(
    on='parameter', index='location', values='value', aggregate_function='mean'
)
shape: (3, 3)
locationpm25no2
strf64f64
"BETR801"23.16949226.95092
"London Westminster"13.44356829.74005
"FR04014"null29.374284
air_quality.pivot(
    on='parameter', index='location', values='value', aggregate_function='mean'
)
shape: (3, 3)
locationpm25no2
strf64f64
"BETR801"23.16949226.95092
"London Westminster"13.44356829.74005
"FR04014"null29.374284
# air_quality.pivot_table(
#     values="value",
#     index="location",
#     columns="parameter",
#     aggfunc="mean",
#     margins=True,
# )

def pivot_with_margins(df, on, index, expr):
    """Pivot ``df`` and append 'All' margin rows/columns, like pandas
    ``pivot_table(..., margins=True)``.

    Parameters
    ----------
    df : pl.DataFrame
        Long-format input frame.
    on : str
        Column whose values become the pivoted column headers.
    index : str
        Column whose values become the output rows.
    expr : pl.Expr
        Aggregation expression applied per group, e.g. ``pl.col('value').mean()``.

    Returns
    -------
    pl.DataFrame
        The pivoted table with an extra 'All' column (aggregate over ``on``)
        and an extra 'All' row (aggregate over ``index``).
    """
    # Literal 'All' labels standing in for the collapsed dimension.
    all_on = pl.lit('All').alias(on)
    all_index = pl.lit('All').alias(index)

    return (
        pl.concat([
            df.group_by(index, on).agg(expr),      # per-cell aggregates
            df.group_by(index).agg(all_on, expr),  # row margins (on collapsed)
            df.group_by(on).agg(all_index, expr),  # column margins (index collapsed)
            df.select(all_on, all_index, expr),    # grand total
        ], how='diagonal')                          # diagonal: frames have differing columns
         .pivot(on=on, index=index)
    )

pivot_with_margins(
    air_quality, 
    on='parameter', 
    index='location', 
    expr=pl.col('value').mean()
)
shape: (4, 4)
locationno2pm25All
strf64f64f64
"FR04014"29.374284null29.374284
"BETR801"26.9509223.16949224.982353
"London Westminster"29.7400513.44356821.491708
"All"29.43031614.38684924.222743

Wide to long format#

no2_pivoted = no2.pivot(on='location', index="date.utc", values="value").sort('date.utc')
no2_pivoted.head(5)
shape: (5, 4)
date.utcFR04014BETR801London Westminster
datetime[μs, UTC]f64f64f64
2019-04-09 01:00:00 UTC24.422.5null
2019-04-09 02:00:00 UTC27.453.567.0
2019-04-09 03:00:00 UTC34.254.567.0
2019-04-09 04:00:00 UTC48.534.541.0
2019-04-09 05:00:00 UTC59.546.541.0
# no_2 = no2_pivoted.melt(id_vars="date.utc")
no_2 = no2_pivoted.unpivot(index='date.utc', variable_name='location')
no_2.head(5)
shape: (5, 3)
date.utclocationvalue
datetime[μs, UTC]strf64
2019-04-09 01:00:00 UTC"FR04014"24.4
2019-04-09 02:00:00 UTC"FR04014"27.4
2019-04-09 03:00:00 UTC"FR04014"34.2
2019-04-09 04:00:00 UTC"FR04014"48.5
2019-04-09 05:00:00 UTC"FR04014"59.5
no_2 = no2_pivoted.unpivot(
    on=["BETR801", "FR04014", "London Westminster"],
    index='date.utc',
    variable_name='id_location',
    value_name='NO_2'
)
no_2.head(5)
shape: (5, 3)
date.utcid_locationNO_2
datetime[μs, UTC]strf64
2019-04-09 01:00:00 UTC"BETR801"22.5
2019-04-09 02:00:00 UTC"BETR801"53.5
2019-04-09 03:00:00 UTC"BETR801"54.5
2019-04-09 04:00:00 UTC"BETR801"34.5
2019-04-09 05:00:00 UTC"BETR801"46.5

How to combine data from multiple tables#

air_quality_no2 = pl.read_csv("data/air_quality_no2_long.csv", try_parse_dates=True)
air_quality_no2 = air_quality_no2.select("date.utc", "location", "parameter", "value")

air_quality_pm25 = pl.read_csv("data/air_quality_pm25_long.csv", try_parse_dates=True)
air_quality_pm25 = air_quality_pm25.select("date.utc", "location", "parameter", "value")

Concatenating objects#

air_quality = pl.concat([air_quality_pm25, air_quality_no2], how='vertical')
air_quality
shape: (3_178, 4)
date.utclocationparametervalue
datetime[μs, UTC]strstrf64
2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
print('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25.shape)
print('Shape of the ``air_quality_no2`` table: ', air_quality_no2.shape)
print('Shape of the resulting ``air_quality`` table: ', air_quality.shape)
Shape of the ``air_quality_pm25`` table:  (1110, 4)
Shape of the ``air_quality_no2`` table:  (2068, 4)
Shape of the resulting ``air_quality`` table:  (3178, 4)
air_quality = air_quality.sort("date.utc")
air_quality.head()
shape: (5, 4)
date.utclocationparametervalue
datetime[μs, UTC]strstrf64
2019-05-07 01:00:00 UTC"BETR801""pm25"12.5
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.0
2019-05-07 01:00:00 UTC"FR04014""no2"25.0
2019-05-07 01:00:00 UTC"BETR801""no2"50.5
2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
# air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"])
air_quality_ = pl.concat([
    air_quality_pm25.with_columns(pl.lit('PM25').alias('key')),
    air_quality_no2.with_columns(pl.lit('NO2').alias('key'))
]).select('key', pl.exclude('key'))
air_quality_
shape: (3_178, 5)
keydate.utclocationparametervalue
strdatetime[μs, UTC]strstrf64
"PM25"2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
"PM25"2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
"PM25"2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
"PM25"2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
"PM25"2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
"NO2"2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
"NO2"2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
"NO2"2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
def concat_with_keys(dfs, keys, key_column_name='key'):
    """Vertically concatenate frames, tagging each with a literal key column.

    Polars analogue of pandas ``pd.concat(dfs, keys=...)``: each frame in
    ``dfs`` is paired with the corresponding entry of ``keys``, stacked, and
    the key column is moved to the front of the result.
    """
    tagged = []
    for frame, key in zip(dfs, keys):
        tagged.append(frame.with_columns(pl.lit(key).alias(key_column_name)))
    stacked = pl.concat(tagged, how='vertical')
    # Put the key column first, mirroring pandas' outer index position.
    return stacked.select(key_column_name, pl.exclude(key_column_name))

air_quality_ = concat_with_keys([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"])
air_quality_
shape: (3_178, 5)
keydate.utclocationparametervalue
strdatetime[μs, UTC]strstrf64
"PM25"2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
"PM25"2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
"PM25"2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
"PM25"2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
"PM25"2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
"NO2"2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
"NO2"2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
"NO2"2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 01:00:00 UTC"London Westminster""no2"23.0

Join tables using a common identifier#

stations_coord = pl.read_csv("data/air_quality_stations.csv")
stations_coord.head(5)
shape: (5, 3)
locationcoordinates.latitudecoordinates.longitude
strf64f64
"BELAL01"51.236194.38522
"BELHB23"51.17034.341
"BELLD01"51.109985.00486
"BELLD02"51.120385.02155
"BELR833"51.327664.36226
air_quality = air_quality.join(stations_coord, on='location', how='left')
air_quality.head(5)
shape: (5, 6)
date.utclocationparametervaluecoordinates.latitudecoordinates.longitude
datetime[μs, UTC]strstrf64f64f64
2019-05-07 01:00:00 UTC"BETR801""pm25"12.551.209664.43182
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.051.49467-0.13193
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837242.3939
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837222.3939
2019-05-07 01:00:00 UTC"BETR801""no2"50.551.209664.43182
air_quality_parameters = pl.read_csv("data/air_quality_parameters.csv")
air_quality_parameters.head(5)
shape: (5, 3)
iddescriptionname
strstrstr
"bc""Black Carbon""BC"
"co""Carbon Monoxide""CO"
"no2""Nitrogen Dioxide""NO2"
"o3""Ozone""O3"
"pm10""Particulate matter less than 1…"PM10"
# air_quality = pd.merge(air_quality, air_quality_parameters, how="left", left_on="parameter", right_on="id")
# Use how='left' to match the pandas tutorial: polars' join defaults to an
# inner join, which would silently drop rows whose parameter has no match
# in the parameters table.
air_quality = air_quality.join(
    air_quality_parameters, left_on='parameter', right_on='id', how='left'
)
air_quality.head()
shape: (5, 8)
date.utclocationparametervaluecoordinates.latitudecoordinates.longitudedescriptionname
datetime[μs, UTC]strstrf64f64f64strstr
2019-05-07 01:00:00 UTC"BETR801""pm25"12.551.209664.43182"Particulate matter less than 2…"PM2.5"
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.051.49467-0.13193"Particulate matter less than 2…"PM2.5"
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837242.3939"Nitrogen Dioxide""NO2"
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837222.3939"Nitrogen Dioxide""NO2"
2019-05-07 01:00:00 UTC"BETR801""no2"50.551.209664.43182"Nitrogen Dioxide""NO2"

How to handle time series data with ease#

air_quality = pl.read_csv("data/air_quality_no2_long.csv")
air_quality = air_quality.rename({"date.utc": "datetime"})
air_quality.head()
shape: (5, 7)
citycountrydatetimelocationparametervalueunit
strstrstrstrstrf64str
"Paris""FR""2019-06-21 00:00:00+00:00""FR04014""no2"20.0"µg/m³"
"Paris""FR""2019-06-20 23:00:00+00:00""FR04014""no2"21.8"µg/m³"
"Paris""FR""2019-06-20 22:00:00+00:00""FR04014""no2"26.5"µg/m³"
"Paris""FR""2019-06-20 21:00:00+00:00""FR04014""no2"24.9"µg/m³"
"Paris""FR""2019-06-20 20:00:00+00:00""FR04014""no2"21.4"µg/m³"
row(
    air_quality['city'].unique(),
    air_quality.select(pl.col('city').unique())
)
shape: (3,)
city
str
"Antwerpen"
"London"
"Paris"
shape: (3, 1)
city
str
"Antwerpen"
"London"
"Paris"

Using pandas datetime properties#

# air_quality["datetime"] = pd.to_datetime(air_quality["datetime"])
air_quality = air_quality.with_columns(
    pl.col('datetime').str.to_datetime()
)
air_quality['datetime']
shape: (2_068,)
datetime
datetime[μs, UTC]
2019-06-21 00:00:00 UTC
2019-06-20 23:00:00 UTC
2019-06-20 22:00:00 UTC
2019-06-20 21:00:00 UTC
2019-06-20 20:00:00 UTC
2019-05-07 06:00:00 UTC
2019-05-07 04:00:00 UTC
2019-05-07 03:00:00 UTC
2019-05-07 02:00:00 UTC
2019-05-07 01:00:00 UTC
# air_quality["datetime"].min(), air_quality["datetime"].max()
air_quality.select(
    min=pl.col('datetime').min(),
    max=pl.col('datetime').max()
).row(0)
(datetime.datetime(2019, 5, 7, 1, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')),
 datetime.datetime(2019, 6, 21, 0, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')))
# air_quality["datetime"].max() - air_quality["datetime"].min()
air_quality.select(
    span=pl.col('datetime').max() - pl.col('datetime').min()
).item()
datetime.timedelta(days=44, seconds=82800)
# air_quality["month"] = air_quality["datetime"].dt.month
# Assign the result back: with_columns returns a NEW DataFrame and does not
# modify ``air_quality`` in place, so without the assignment the month
# column is silently discarded.
air_quality = air_quality.with_columns(
    month=pl.col('datetime').dt.month()
)
air_quality.head(5)
shape: (5, 7)
citycountrydatetimelocationparametervalueunit
strstrdatetime[μs, UTC]strstrf64str
"Paris""FR"2019-06-21 00:00:00 UTC"FR04014""no2"20.0"µg/m³"
"Paris""FR"2019-06-20 23:00:00 UTC"FR04014""no2"21.8"µg/m³"
"Paris""FR"2019-06-20 22:00:00 UTC"FR04014""no2"26.5"µg/m³"
"Paris""FR"2019-06-20 21:00:00 UTC"FR04014""no2"24.9"µg/m³"
"Paris""FR"2019-06-20 20:00:00 UTC"FR04014""no2"21.4"µg/m³"
# air_quality.groupby(
#    [air_quality["datetime"].dt.weekday, "location"])["value"].mean()
(
air_quality.group_by(
    pl.col('datetime').dt.weekday().alias('weekday'), 'location'
)
.agg(
    pl.col('value').mean()
)
.sort('weekday', 'location')
)
shape: (21, 3)
weekdaylocationvalue
i8strf64
1"BETR801"27.875
1"FR04014"24.85625
1"London Westminster"23.969697
2"BETR801"22.214286
2"FR04014"30.999359
6"FR04014"25.266154
6"London Westminster"24.977612
7"BETR801"21.896552
7"FR04014"23.274306
7"London Westminster"24.859155
(
air_quality
.group_by(
    pl.col('datetime').dt.hour().alias('hour')
)
.agg(
    pl.col('value').mean()
)
.sort('hour')
.hvplot.bar(x='hour', y='value')
)

Datetime as index#

no_2 = air_quality.pivot(on='location', index='datetime', values='value').sort('datetime')
no_2.head(5)
shape: (5, 4)
datetimeFR04014BETR801London Westminster
datetime[μs, UTC]f64f64f64
2019-05-07 01:00:00 UTC25.050.523.0
2019-05-07 02:00:00 UTC27.745.019.0
2019-05-07 03:00:00 UTC50.4null19.0
2019-05-07 04:00:00 UTC61.9null16.0
2019-05-07 05:00:00 UTC72.4nullnull
# no_2.index.year, no_2.index.weekday
row(
    no_2['datetime'].dt.year(),
    no_2['datetime'].dt.weekday()
)
shape: (1_033,)
datetime
i32
2019
2019
2019
2019
2019
2019
2019
2019
2019
2019
shape: (1_033,)
datetime
i8
2
2
2
2
2
4
4
4
4
5
no_2.filter(
    pl.col('datetime').dt.date().is_between(
        pl.date(2019, 5, 20), 
        pl.date(2019, 5, 21)
    )
).hvplot.line(x='datetime')

Resample a time series to another frequency#

# monthly_max = no_2.resample("ME").max()
monthly_max = (
no_2.group_by(
    pl.col('datetime').dt.date().dt.month_end()
)
.agg(
    pl.all().max()
)
.sort('datetime')
)
monthly_max 
shape: (2, 4)
datetimeFR04014BETR801London Westminster
datef64f64f64
2019-05-3197.074.597.0
2019-06-3084.752.552.0
# no_2.resample("D").mean().plot(style="-o", figsize=(10, 5))
df = (
no_2
.group_by(
    pl.col('datetime').dt.truncate('1d')
)
.agg(
    pl.all().mean()
)
)
df.hvplot.line(x='datetime') * df.hvplot.scatter(x='datetime')

How to manipulate textual data#

titanic = pl.read_csv("data/titanic.csv")
# titanic["Name"].str.lower()
row(
    titanic['Name'].str.to_lowercase(),
    titanic.select(pl.col('Name').str.to_lowercase()).to_series()
)
shape: (891,)
Name
str
"braund, mr. owen harris"
"cumings, mrs. john bradley (fl…
"heikkinen, miss laina"
"futrelle, mrs. jacques heath (…
"allen, mr. william henry"
"montvila, rev. juozas"
"graham, miss margaret edith"
"johnston, miss catherine helen…
"behr, mr. karl howell"
"dooley, mr. patrick"
shape: (891,)
Name
str
"braund, mr. owen harris"
"cumings, mrs. john bradley (fl…
"heikkinen, miss laina"
"futrelle, mrs. jacques heath (…
"allen, mr. william henry"
"montvila, rev. juozas"
"graham, miss margaret edith"
"johnston, miss catherine helen…
"behr, mr. karl howell"
"dooley, mr. patrick"
# titanic["Name"].str.split(",")
row(
    titanic['Name'].str.split(","),
    titanic.select(pl.col('Name').str.split(",")).to_series()
)
shape: (891,)
Name
list[str]
["Braund", " Mr. Owen Harris"]
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"]
["Heikkinen", " Miss Laina"]
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"]
["Allen", " Mr. William Henry"]
["Montvila", " Rev. Juozas"]
["Graham", " Miss Margaret Edith"]
["Johnston", " Miss Catherine Helen "Carrie""]
["Behr", " Mr. Karl Howell"]
["Dooley", " Mr. Patrick"]
shape: (891,)
Name
list[str]
["Braund", " Mr. Owen Harris"]
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"]
["Heikkinen", " Miss Laina"]
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"]
["Allen", " Mr. William Henry"]
["Montvila", " Rev. Juozas"]
["Graham", " Miss Margaret Edith"]
["Johnston", " Miss Catherine Helen "Carrie""]
["Behr", " Mr. Karl Howell"]
["Dooley", " Mr. Patrick"]
# titanic["Surname"] = titanic["Name"].str.split(",").str.get(0)
titanic = titanic.with_columns(
    Surname=pl.col('Name').str.split(",").list.get(0)
)
titanic['Surname']
shape: (891,)
Surname
str
"Braund"
"Cumings"
"Heikkinen"
"Futrelle"
"Allen"
"Montvila"
"Graham"
"Johnston"
"Behr"
"Dooley"
# titanic["Name"].str.contains("Countess")
row(
    titanic["Name"].str.contains("Countess"),
    titanic.select(pl.col('Name').str.contains("Countess")).to_series()
)
shape: (891,)
Name
bool
false
false
false
false
false
false
false
false
false
false
shape: (891,)
Name
bool
false
false
false
false
false
false
false
false
false
false
# titanic[titanic["Name"].str.contains("Countess")]
titanic.filter(
    pl.col('Name').str.contains("Countess")
)
shape: (1, 13)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSurname
i64i64i64strstrf64i64i64strf64strstrstr
76011"Rothes, the Countess. of (Lucy…"female"33.000"110152"86.5"B77""S""Rothes"
# titanic["Name"].str.len()
row(
    titanic["Name"].str.len_chars(),
    titanic.select(pl.col('Name').str.len_chars()).to_series()
)
shape: (891,)
Name
u32
23
51
21
44
24
21
27
39
21
19
shape: (891,)
Name
u32
23
51
21
44
24
21
27
39
21
19
# titanic["Name"].str.len().idxmax()
titanic.select(pl.col('Name').str.len_chars().arg_max()).item()
307
# titanic.loc[titanic["Name"].str.len().idxmax(), "Name"]
titanic.row(
    titanic.select(pl.col('Name').str.len_chars().arg_max()).item(), named=True
)['Name']
'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'
titanic.select(
    pl.col('Name').get(pl.col('Name').str.len_chars().arg_max())
).item()
'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'
# titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", "female": "F"})
titanic = titanic.with_columns(
    Sex_short=pl.col('Sex').replace({"male": "M", "female": "F"})
)
titanic['Sex_short']
shape: (891,)
Sex_short
str
"M"
"F"
"F"
"F"
"M"
"M"
"F"
"F"
"M"
"M"