Getting started tutorials#

This notebook contains code that reimplements the examples from the Pandas Getting Started documentation using Polars: https://pandas.pydata.org/docs/getting_started/. I hope this notebook helps you transition from Pandas to Polars more easily. Be mindful that Polars and Pandas have different philosophies, so there are some tasks that are easy to accomplish in Pandas but may require more effort in Polars.

from helper.jupyter import row
import polars as pl
from polars import selectors as cs
import holoviews as hv
import hvplot.polars # need to import this to enable the .hvplot accessor on Polars objects

What kind of data does pandas handle?#

df = pl.DataFrame(
    {
        "Name": [
            "Braund, Mr. Owen Harris",
            "Allen, Mr. William Henry",
            "Bonnell, Miss. Elizabeth",
        ],
        "Age": [22, 35, 58],
        "Sex": ["male", "male", "female"],
    }
)
df
shape: (3, 3)
NameAgeSex
stri64str
"Braund, Mr. Owen Harris"22"male"
"Allen, Mr. William Henry"35"male"
"Bonnell, Miss. Elizabeth"58"female"
# df["Age"]
row(
    df['Age'],
    df.get_column('Age')
)
shape: (3,)
Age
i64
22
35
58
shape: (3,)
Age
i64
22
35
58
ages = pl.Series('Age', [22, 35, 58])
ages
shape: (3,)
Age
i64
22
35
58
df['Age'].max()
58
ages.max()
58
df.select(pl.col('Age').max()).item()
58
df.describe()
shape: (9, 4)
statisticNameAgeSex
strstrf64str
"count""3"3.0"3"
"null_count""0"0.0"0"
"mean"null38.333333null
"std"null18.230012null
"min""Allen, Mr. William Henry"22.0"female"
"25%"null35.0null
"50%"null35.0null
"75%"null58.0null
"max""Braund, Mr. Owen Harris"58.0"male"

How do I read and write tabular data?#

titanic = pl.read_csv('data/titanic.csv')
titanic.head(3)
shape: (3, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
titanic.tail(3)
shape: (3, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
88903"Johnston, Miss Catherine Helen…"female"null12"W./C. 6607"23.45null"S"
89011"Behr, Mr. Karl Howell""male"26.000"111369"30.0"C148""C"
89103"Dooley, Mr. Patrick""male"32.000"370376"7.75null"Q"
titanic.dtypes
[Int64,
 Int64,
 Int64,
 String,
 String,
 Float64,
 Int64,
 Int64,
 String,
 Float64,
 String,
 String]
titanic.schema
Schema([('PassengerId', Int64),
        ('Survived', Int64),
        ('Pclass', Int64),
        ('Name', String),
        ('Sex', String),
        ('Age', Float64),
        ('SibSp', Int64),
        ('Parch', Int64),
        ('Ticket', String),
        ('Fare', Float64),
        ('Cabin', String),
        ('Embarked', String)])
titanic.write_excel('data/titanic.xlsx', worksheet='passengers');
titanic = pl.read_excel('data/titanic.xlsx', sheet_name='passengers')
# titanic.info()
titanic.glimpse()
Rows: 891
Columns: 12
$ PassengerId <i64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ Survived    <i64> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1
$ Pclass      <i64> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2
$ Name        <str> 'Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Heikkinen, Miss Laina', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'Allen, Mr. William Henry', 'Moran, Mr. James', 'McCarthy, Mr. Timothy J', 'Palsson, Master Gosta Leonard', 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)', 'Nasser, Mrs. Nicholas (Adele Achem)'
$ Sex         <str> 'male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'female'
$ Age         <f64> 22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0
$ SibSp       <i64> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1
$ Parch       <i64> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0
$ Ticket      <str> 'A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909', '347742', '237736'
$ Fare        <f64> 7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51.8625, 21.075, 11.1333, 30.0708
$ Cabin       <str> None, 'C85', None, 'C123', None, None, 'E46', None, None, None
$ Embarked    <str> 'S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C'
titanic.estimated_size()
85640
titanic.null_count()
shape: (1, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
u32u32u32u32u32u32u32u32u32u32u32u32
0000017700006872
titanic.n_unique()
891
titanic.select(pl.all().n_unique())
shape: (1, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
u32u32u32u32u32u32u32u32u32u32u32u32
89123891289776812481484

How do I select a subset of a DataFrame?#

How do I select specific columns from a DataFrame?#

age = titanic['Age']
age = titanic.get_column('Age') # or this
age.head(5)
shape: (5,)
Age
f64
22.0
38.0
26.0
35.0
35.0
type(age)
polars.series.series.Series
age.shape
(891,)
age_sex = titanic[['Age', 'Sex']]
age_sex = titanic.select('Age', 'Sex') #or this
age_sex.head(5)
shape: (5, 2)
AgeSex
f64str
22.0"male"
38.0"female"
26.0"female"
35.0"female"
35.0"male"
type(age_sex)
polars.dataframe.frame.DataFrame
age_sex.shape
(891, 2)

How do I filter specific rows from a DataFrame?#

# above_35 = titanic[titanic["Age"] > 35]
above_35 = titanic.filter(pl.col('Age') > 35)
above_35.head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
701"McCarthy, Mr. Timothy J""male"54.000"17463"51.8625"E46""S"
1211"Bonnell, Miss Elizabeth""female"58.000"113783"26.55"C103""S"
1403"Andersson, Mr. Anders Johan""male"39.015"347082"31.275null"S"
1612"Hewlett, Mrs. (Mary D Kingcome…"female"55.000"248706"16.0null"S"
# titanic["Age"] > 35
row(
    titanic['Age'] > 35,
    titanic.select(pl.col('Age') > 35),
    titanic.select(pl.col('Age') > 35).to_series()
)
shape: (891,)
Age
bool
false
true
false
false
false
false
false
null
false
false
shape: (891, 1)
Age
bool
false
true
false
false
false
false
false
null
false
false
shape: (891,)
Age
bool
false
true
false
false
false
false
false
null
false
false
above_35.shape
(217, 12)
# class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23 = titanic.filter(pl.col('Pclass').is_in([2, 3]))
class_23.head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"
603"Moran, Mr. James""male"null00"330877"8.4583null"Q"
803"Palsson, Master Gosta Leonard""male"2.031"349909"21.075null"S"
# class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
class_23 = titanic.filter(
    (pl.col('Pclass') == 2) | (pl.col('Pclass') == 3)
)

class_23 = titanic.filter(
    pl.col('Pclass').eq(2).or_(pl.col('Pclass').eq(3))
)
# age_no_na = titanic[titanic["Age"].notna()]
age_no_na = titanic.filter(pl.col('Age').is_not_null())
age_no_na.head()
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
411"Futrelle, Mrs. Jacques Heath (…"female"35.010"113803"53.1"C123""S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"
age_no_na.shape
(714, 12)

How do I select specific rows and columns from a DataFrame?#

# adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
adult_names = (
    titanic
    .filter(pl.col('Age') > 35)
    .select('Name')
    .to_series()
)
adult_names.head(5)
shape: (5,)
Name
str
"Cumings, Mrs. John Bradley (Fl…
"McCarthy, Mr. Timothy J"
"Bonnell, Miss Elizabeth"
"Andersson, Mr. Anders Johan"
"Hewlett, Mrs. (Mary D Kingcome…
# titanic.iloc[9:25, 2:5]
titanic.select(pl.nth(range(2, 5)).slice(9, 25 - 9))
shape: (16, 3)
PclassNameSex
i64strstr
2"Nasser, Mrs. Nicholas (Adele A…"female"
3"Sandstrom, Miss Marguerite Rut""female"
1"Bonnell, Miss Elizabeth""female"
3"Saundercock, Mr. William Henry""male"
3"Andersson, Mr. Anders Johan""male"
2"Fynney, Mr. Joseph J""male"
2"Beesley, Mr. Lawrence""male"
3"McGowan, Miss Anna "Annie"""female"
1"Sloper, Mr. William Thompson""male"
3"Palsson, Miss Torborg Danira""female"
# titanic.iloc[0:3, 3] = "anonymous"
titanic = titanic.with_columns(
    pl.when(pl.int_range(pl.len()) < 3)
    .then(pl.lit('anonymous'))
    .otherwise(pl.nth(3))
    .name.keep()
)
titanic.head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"anonymous""male"22.010"A/5 21171"7.25null"S"
211"anonymous""female"38.010"PC 17599"71.2833"C85""C"
313"anonymous""female"26.000"STON/O2. 3101282"7.925null"S"
411"Futrelle, Mrs. Jacques Heath (…"female"35.010"113803"53.1"C123""S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"

How do I create plots in pandas?#

air_quality = pl.read_csv("data/air_quality_no2.csv", try_parse_dates=True)
air_quality.head(5)
shape: (5, 4)
datetimestation_antwerpstation_parisstation_london
datetime[μs]f64f64f64
2019-05-07 02:00:00nullnull23.0
2019-05-07 03:00:0050.525.019.0
2019-05-07 04:00:0045.027.719.0
2019-05-07 05:00:00null50.416.0
2019-05-07 06:00:00null61.9null
air_quality.hvplot.line(x='datetime')
air_quality.hvplot.line(x='datetime', y='station_paris')
air_quality.hvplot.scatter(x="station_london", y="station_paris", alpha=0.5)
[
    method_name
    for method_name in dir(air_quality.hvplot)
    if not method_name.startswith("_")
]
['area',
 'bar',
 'barh',
 'bivariate',
 'box',
 'dataset',
 'density',
 'errorbars',
 'explorer',
 'heatmap',
 'hexbin',
 'hist',
 'kde',
 'labels',
 'line',
 'ohlc',
 'paths',
 'points',
 'polygons',
 'scatter',
 'step',
 'table',
 'vectorfield',
 'violin']
air_quality.hvplot.box()
# air_quality.plot.area(figsize=(12, 4), subplots=True)
(
air_quality
.unpivot(index="datetime", variable_name="station", value_name="quality")
.hvplot.area(x='datetime', y='quality', groupby="station")
.opts(height=180)
.layout()
.cols(1)
)
fig = (
air_quality
.unpivot(index="datetime", variable_name="station", value_name="quality")
.with_columns(
    pl.col('quality').fill_null(0)
)
.hvplot.area(x='datetime', y='quality', groupby='station')
.redim(quality=r"NO2 concentration")
.opts(alpha=0.8)
.overlay()
)
fig
hv.save(fig, 'data/no2_concentrations.html')

How to create new columns derived from existing columns#

air_quality = pl.read_csv("data/air_quality_no2.csv", try_parse_dates=True)
air_quality.head(5)
shape: (5, 4)
datetimestation_antwerpstation_parisstation_london
datetime[μs]f64f64f64
2019-05-07 02:00:00nullnull23.0
2019-05-07 03:00:0050.525.019.0
2019-05-07 04:00:0045.027.719.0
2019-05-07 05:00:00null50.416.0
2019-05-07 06:00:00null61.9null
# air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.882
air_quality = air_quality.with_columns(
    london_mg_per_cubic = pl.col("station_london") * 1.882
)
# air_quality["ratio_paris_antwerp"] = (
#    air_quality["station_paris"] / air_quality["station_antwerp"]
# )

air_quality = air_quality.with_columns(
    ratio_paris_antwerp = pl.col("station_paris") / pl.col("station_antwerp")
)
air_quality_renamed = air_quality.rename(
    {
        "station_antwerp": "BETR801",
        "station_paris": "FR04014",
        "station_london": "London Westminster",
    }
)
air_quality_renamed.head(5)
shape: (5, 6)
datetimeBETR801FR04014London Westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime[μs]f64f64f64f64f64
2019-05-07 02:00:00nullnull23.043.286null
2019-05-07 03:00:0050.525.019.035.7580.49505
2019-05-07 04:00:0045.027.719.035.7580.615556
2019-05-07 05:00:00null50.416.030.112null
2019-05-07 06:00:00null61.9nullnullnull
air_quality_renamed = air_quality_renamed.rename(str.lower)
air_quality_renamed.head(5)
shape: (5, 6)
datetimebetr801fr04014london westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime[μs]f64f64f64f64f64
2019-05-07 02:00:00nullnull23.043.286null
2019-05-07 03:00:0050.525.019.035.7580.49505
2019-05-07 04:00:0045.027.719.035.7580.615556
2019-05-07 05:00:00null50.416.030.112null
2019-05-07 06:00:00null61.9nullnullnull

How to calculate summary statistics#

Aggregating statistics#

titanic = pl.read_csv("data/titanic.csv")
(
    titanic['Age'].mean(),
    titanic.select(pl.col('Age').mean()).item() #or this
)
(29.69911764705882, 29.69911764705882)
res1 = titanic[["Age", "Fare"]].median()
res2 = titanic.select(pl.col('Age', 'Fare').median())
row(res1, res2)
shape: (1, 2)
AgeFare
f64f64
28.014.4542
shape: (1, 2)
AgeFare
f64f64
28.014.4542
res1.transpose(include_header=True, column_names=['Value'])
shape: (2, 2)
columnValue
strf64
"Age"28.0
"Fare"14.4542
titanic[["Age", "Fare"]].describe()
shape: (9, 3)
statisticAgeFare
strf64f64
"count"714.0891.0
"null_count"177.00.0
"mean"29.69911832.204208
"std"14.52649749.693429
"min"0.420.0
"25%"20.07.925
"50%"28.014.4542
"75%"38.031.0
"max"80.0512.3292
# titanic.agg(
#    {
#        "Age": ["min", "max", "median", "skew"],
#        "Fare": ["min", "max", "median", "mean"],
#    }
# )

age, fare = [pl.col(name) for name in ['Age', 'Fare']]
(
titanic
.select(
    Age=pl.struct(
        min=age.min(),
        max=age.max(),
        median=age.median(),
        skew=age.skew()
    ),
    Fare=pl.struct(
        min=fare.min(),
        max=fare.max(),
        median=fare.median(),
        mean=fare.mean()
    )
)
.transpose(include_header=True, column_names=['agg'])
.unnest('agg')
)
shape: (2, 6)
columnminmaxmedianskewmean
strf64f64f64f64f64
"Age"0.4280.028.00.38829null
"Fare"0.0512.329214.4542null32.204208
def agg(df, items):
    """Mimic pandas ``DataFrame.agg`` with a dict of column -> aggregation names.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to aggregate.
    items : dict[str, list[str]]
        Maps a column name to the names of ``pl.Expr`` aggregation methods
        (e.g. ``"min"``, ``"max"``, ``"median"``, ``"skew"``) to apply to it.

    Returns
    -------
    pl.DataFrame
        One row per requested column and one column per aggregation name,
        with null where an aggregation was not requested for that column.
    """
    # Pack each column's aggregates into a single struct so one select()
    # yields one row, then transpose so input columns become rows and
    # unnest the struct back into per-aggregation columns (pandas-like layout).
    exprs = {
        key: pl.struct(**{agg_name: getattr(pl.col(key), agg_name)() for agg_name in agg_names})
        for key, agg_names in items.items()
    }
    return df.select(**exprs).transpose(include_header=True, column_names=["agg"]).unnest("agg")


agg(
    titanic,
    {
        "Age": ["min", "max", "median", "skew"],
        "Fare": ["min", "max", "median", "mean"],
    },
)
shape: (2, 6)
columnminmaxmedianskewmean
strf64f64f64f64f64
"Age"0.4280.028.00.38829null
"Fare"0.0512.329214.4542null32.204208

Aggregating statistics grouped by category#

# titanic[["Sex", "Age"]].groupby("Sex").mean()
titanic.group_by('Sex').agg(pl.col('Age').mean())
shape: (2, 2)
SexAge
strf64
"female"27.915709
"male"30.726645
# titanic.groupby("Sex").mean(numeric_only=True)

from polars import selectors as cs
titanic.group_by('Sex').agg(cs.numeric().mean())
shape: (2, 8)
SexPassengerIdSurvivedPclassAgeSibSpParchFare
strf64f64f64f64f64f64f64
"female"431.0286620.7420382.15923627.9157090.6942680.64968244.479818
"male"454.1473140.1889082.38994830.7266450.4298090.23570225.523893
# titanic.groupby(["Sex", "Pclass"])["Fare"].mean()
(
titanic
.group_by('Sex', 'Pclass')
.agg(pl.col('Fare').mean())
.sort('Sex', 'Pclass')
)
shape: (6, 3)
SexPclassFare
stri64f64
"female"1106.125798
"female"221.970121
"female"316.11881
"male"167.226127
"male"219.741782
"male"312.661633

Count number of records by category#

# titanic["Pclass"].value_counts()
res1 = titanic['Pclass'].value_counts().sort('count', descending=True)
res2 = (
titanic
.select(
    pl.col('Pclass')
      .value_counts()
      .struct.unnest()
)
.sort('count', descending=True)
)
row(res1, res2)
shape: (3, 2)
Pclasscount
i64u32
3491
1216
2184
shape: (3, 2)
Pclasscount
i64u32
3491
1216
2184
# titanic.groupby("Pclass")["Pclass"].count()
titanic.group_by('Pclass').agg(count=pl.len())
shape: (3, 2)
Pclasscount
i64u32
3491
2184
1216

How to reshape the layout of tables#

titanic = pl.read_csv("data/titanic.csv")
air_quality = pl.read_csv("data/air_quality_long.csv", try_parse_dates=True)

Sort table rows#

titanic.sort('Age', nulls_last=True).head(5)
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
80413"Thomas, Master Assad Alexander""male"0.4201"2625"8.5167null"C"
75612"Hamalainen, Master Viljo""male"0.6711"250649"14.5null"S"
47013"Baclini, Miss Helene Barbara""female"0.7521"2666"19.2583null"C"
64513"Baclini, Miss Eugenie""female"0.7521"2666"19.2583null"C"
7912"Caldwell, Master Alden Gates""male"0.8302"248738"29.0null"S"
titanic.sort('Pclass', 'Age', descending=True, nulls_last=True).head()
shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
85203"Svensson, Mr. Johan""male"74.000"347060"7.775null"S"
11703"Connors, Mr. Patrick""male"70.500"370369"7.75null"Q"
28103"Duane, Mr. Frank""male"65.000"336439"7.75null"Q"
48413"Turkula, Mrs. (Hedwig)""female"63.000"4134"9.5875null"S"
32703"Nysveen, Mr. Johan Hansen""male"61.000"345364"6.2375null"S"

Long to wide table format#

no2 = air_quality.filter(pl.col('parameter') == 'no2')
# no2_subset = no2.sort_index().groupby(["location"]).head(2)
no2_subset = (
no2
.sort('date.utc')
.group_by('location')
.agg(pl.all().head(2))
.explode(pl.exclude('location'))
)
no2_subset
shape: (6, 7)
locationcitycountrydate.utcparametervalueunit
strstrstrdatetime[μs, UTC]strf64str
"BETR801""Antwerpen""BE"2019-04-09 01:00:00 UTC"no2"22.5"µg/m³"
"BETR801""Antwerpen""BE"2019-04-09 02:00:00 UTC"no2"53.5"µg/m³"
"FR04014""Paris""FR"2019-04-09 01:00:00 UTC"no2"24.4"µg/m³"
"FR04014""Paris""FR"2019-04-09 02:00:00 UTC"no2"27.4"µg/m³"
"London Westminster""London""GB"2019-04-09 02:00:00 UTC"no2"67.0"µg/m³"
"London Westminster""London""GB"2019-04-09 03:00:00 UTC"no2"67.0"µg/m³"
no2_subset.pivot(on="location", index='date.utc', values='value')
shape: (3, 4)
date.utcBETR801FR04014London Westminster
datetime[μs, UTC]f64f64f64
2019-04-09 01:00:00 UTC22.524.4null
2019-04-09 02:00:00 UTC53.527.467.0
2019-04-09 03:00:00 UTCnullnull67.0
no2.head(5)
shape: (5, 7)
citycountrydate.utclocationparametervalueunit
strstrdatetime[μs, UTC]strstrf64str
"Paris""FR"2019-06-21 00:00:00 UTC"FR04014""no2"20.0"µg/m³"
"Paris""FR"2019-06-20 23:00:00 UTC"FR04014""no2"21.8"µg/m³"
"Paris""FR"2019-06-20 22:00:00 UTC"FR04014""no2"26.5"µg/m³"
"Paris""FR"2019-06-20 21:00:00 UTC"FR04014""no2"24.9"µg/m³"
"Paris""FR"2019-06-20 20:00:00 UTC"FR04014""no2"21.4"µg/m³"
(
no2
.sort('date.utc')
.pivot(on='location', index='date.utc', values='value')
.hvplot.line(x="date.utc")
)

Pivot table#

air_quality.pivot(
    on='parameter', index='location', values='value', aggregate_function='mean'
)
shape: (3, 3)
locationpm25no2
strf64f64
"BETR801"23.16949226.95092
"London Westminster"13.44356829.74005
"FR04014"null29.374284
air_quality.pivot(
    on='parameter', index='location', values='value', aggregate_function='mean'
)
shape: (3, 3)
locationpm25no2
strf64f64
"BETR801"23.16949226.95092
"London Westminster"13.44356829.74005
"FR04014"null29.374284
# air_quality.pivot_table(
#     values="value",
#     index="location",
#     columns="parameter",
#     aggfunc="mean",
#     margins=True,
# )

def pivot_with_margins(df, on, index, expr):
    """Pivot ``df`` and append 'All' margin rows/columns, like pandas
    ``pivot_table(..., margins=True)``.

    Parameters
    ----------
    df : pl.DataFrame
        Long-format input frame.
    on : str
        Column whose values become the pivoted column headers.
    index : str
        Column whose values become the output rows.
    expr : pl.Expr
        Aggregation expression applied per group, e.g. ``pl.col('value').mean()``.

    Returns
    -------
    pl.DataFrame
        The pivoted table with an extra 'All' column (aggregate over ``on``)
        and an extra 'All' row (aggregate over ``index``).
    """
    # Literal 'All' labels standing in for the collapsed dimension.
    all_on = pl.lit('All').alias(on)
    all_index = pl.lit('All').alias(index)

    return (
        pl.concat([
            df.group_by(index, on).agg(expr),      # per-cell aggregates
            df.group_by(index).agg(all_on, expr),  # row margins (on collapsed)
            df.group_by(on).agg(all_index, expr),  # column margins (index collapsed)
            df.select(all_on, all_index, expr),    # grand total
        ], how='diagonal')                          # diagonal: frames have differing columns
         .pivot(on=on, index=index)
    )

pivot_with_margins(
    air_quality, 
    on='parameter', 
    index='location', 
    expr=pl.col('value').mean()
)
shape: (4, 4)
locationno2pm25All
strf64f64f64
"FR04014"29.374284null29.374284
"BETR801"26.9509223.16949224.982353
"London Westminster"29.7400513.44356821.491708
"All"29.43031614.38684924.222743

Wide to long format#

no2_pivoted = no2.pivot(on='location', index="date.utc", values="value").sort('date.utc')
no2_pivoted.head(5)
shape: (5, 4)
date.utcFR04014BETR801London Westminster
datetime[μs, UTC]f64f64f64
2019-04-09 01:00:00 UTC24.422.5null
2019-04-09 02:00:00 UTC27.453.567.0
2019-04-09 03:00:00 UTC34.254.567.0
2019-04-09 04:00:00 UTC48.534.541.0
2019-04-09 05:00:00 UTC59.546.541.0
# no_2 = no2_pivoted.melt(id_vars="date.utc")
no_2 = no2_pivoted.unpivot(index='date.utc', variable_name='location')
no_2.head(5)
shape: (5, 3)
date.utclocationvalue
datetime[μs, UTC]strf64
2019-04-09 01:00:00 UTC"FR04014"24.4
2019-04-09 02:00:00 UTC"FR04014"27.4
2019-04-09 03:00:00 UTC"FR04014"34.2
2019-04-09 04:00:00 UTC"FR04014"48.5
2019-04-09 05:00:00 UTC"FR04014"59.5
no_2 = no2_pivoted.unpivot(
    on=["BETR801", "FR04014", "London Westminster"],
    index='date.utc',
    variable_name='id_location',
    value_name='NO_2'
)
no_2.head(5)
shape: (5, 3)
date.utcid_locationNO_2
datetime[μs, UTC]strf64
2019-04-09 01:00:00 UTC"BETR801"22.5
2019-04-09 02:00:00 UTC"BETR801"53.5
2019-04-09 03:00:00 UTC"BETR801"54.5
2019-04-09 04:00:00 UTC"BETR801"34.5
2019-04-09 05:00:00 UTC"BETR801"46.5

How to combine data from multiple tables#

air_quality_no2 = pl.read_csv("data/air_quality_no2_long.csv", try_parse_dates=True)
air_quality_no2 = air_quality_no2.select("date.utc", "location", "parameter", "value")

air_quality_pm25 = pl.read_csv("data/air_quality_pm25_long.csv", try_parse_dates=True)
air_quality_pm25 = air_quality_pm25.select("date.utc", "location", "parameter", "value")

Concatenating objects#

air_quality = pl.concat([air_quality_pm25, air_quality_no2], how='vertical')
air_quality
shape: (3_178, 4)
date.utclocationparametervalue
datetime[μs, UTC]strstrf64
2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
print('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25.shape)
print('Shape of the ``air_quality_no2`` table: ', air_quality_no2.shape)
print('Shape of the resulting ``air_quality`` table: ', air_quality.shape)
Shape of the ``air_quality_pm25`` table:  (1110, 4)
Shape of the ``air_quality_no2`` table:  (2068, 4)
Shape of the resulting ``air_quality`` table:  (3178, 4)
air_quality = air_quality.sort("date.utc")
air_quality.head()
shape: (5, 4)
date.utclocationparametervalue
datetime[μs, UTC]strstrf64
2019-05-07 01:00:00 UTC"BETR801""pm25"12.5
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.0
2019-05-07 01:00:00 UTC"FR04014""no2"25.0
2019-05-07 01:00:00 UTC"BETR801""no2"50.5
2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
# air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"])
air_quality_ = pl.concat([
    air_quality_pm25.with_columns(pl.lit('PM25').alias('key')),
    air_quality_no2.with_columns(pl.lit('NO2').alias('key'))
]).select('key', pl.exclude('key'))
air_quality_
shape: (3_178, 5)
keydate.utclocationparametervalue
strdatetime[μs, UTC]strstrf64
"PM25"2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
"PM25"2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
"PM25"2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
"PM25"2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
"PM25"2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
"NO2"2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
"NO2"2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
"NO2"2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
def concat_with_keys(dfs, keys, key_column_name='key'):
    """Vertically concatenate frames, tagging each with a literal key column.

    Polars analogue of pandas ``pd.concat(dfs, keys=...)``: each frame in
    ``dfs`` is paired with the corresponding entry of ``keys``, stacked, and
    the key column is moved to the front of the result.
    """
    tagged = []
    for frame, key in zip(dfs, keys):
        tagged.append(frame.with_columns(pl.lit(key).alias(key_column_name)))
    stacked = pl.concat(tagged, how='vertical')
    # Put the key column first, mirroring pandas' outer index position.
    return stacked.select(key_column_name, pl.exclude(key_column_name))

air_quality_ = concat_with_keys([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"])
air_quality_
shape: (3_178, 5)
keydate.utclocationparametervalue
strdatetime[μs, UTC]strstrf64
"PM25"2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
"PM25"2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
"PM25"2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
"PM25"2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
"PM25"2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
"NO2"2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
"NO2"2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
"NO2"2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 01:00:00 UTC"London Westminster""no2"23.0

Join tables using a common identifier#

stations_coord = pl.read_csv("data/air_quality_stations.csv")
stations_coord.head(5)
shape: (5, 3)
locationcoordinates.latitudecoordinates.longitude
strf64f64
"BELAL01"51.236194.38522
"BELHB23"51.17034.341
"BELLD01"51.109985.00486
"BELLD02"51.120385.02155
"BELR833"51.327664.36226
air_quality = air_quality.join(stations_coord, on='location', how='left')
air_quality.head(5)
shape: (5, 6)
date.utclocationparametervaluecoordinates.latitudecoordinates.longitude
datetime[μs, UTC]strstrf64f64f64
2019-05-07 01:00:00 UTC"BETR801""pm25"12.551.209664.43182
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.051.49467-0.13193
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837242.3939
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837222.3939
2019-05-07 01:00:00 UTC"BETR801""no2"50.551.209664.43182
air_quality_parameters = pl.read_csv("data/air_quality_parameters.csv")
air_quality_parameters.head(5)
shape: (5, 3)
iddescriptionname
strstrstr
"bc""Black Carbon""BC"
"co""Carbon Monoxide""CO"
"no2""Nitrogen Dioxide""NO2"
"o3""Ozone""O3"
"pm10""Particulate matter less than 1…"PM10"
# air_quality = pd.merge(air_quality, air_quality_parameters, how="left", left_on="parameter", right_on="id")
# Use how='left' to match the pandas tutorial: polars' join defaults to an
# inner join, which would silently drop rows whose parameter has no match
# in the parameters table.
air_quality = air_quality.join(
    air_quality_parameters, left_on='parameter', right_on='id', how='left'
)
air_quality.head()
shape: (5, 8)
date.utclocationparametervaluecoordinates.latitudecoordinates.longitudedescriptionname
datetime[μs, UTC]strstrf64f64f64strstr
2019-05-07 01:00:00 UTC"BETR801""pm25"12.551.209664.43182"Particulate matter less than 2…"PM2.5"
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.051.49467-0.13193"Particulate matter less than 2…"PM2.5"
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837242.3939"Nitrogen Dioxide""NO2"
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837222.3939"Nitrogen Dioxide""NO2"
2019-05-07 01:00:00 UTC"BETR801""no2"50.551.209664.43182"Nitrogen Dioxide""NO2"

How to handle time series data with ease#

air_quality = pl.read_csv("data/air_quality_no2_long.csv")
air_quality = air_quality.rename({"date.utc": "datetime"})
air_quality.head()
shape: (5, 7)
citycountrydatetimelocationparametervalueunit
strstrstrstrstrf64str
"Paris""FR""2019-06-21 00:00:00+00:00""FR04014""no2"20.0"µg/m³"
"Paris""FR""2019-06-20 23:00:00+00:00""FR04014""no2"21.8"µg/m³"
"Paris""FR""2019-06-20 22:00:00+00:00""FR04014""no2"26.5"µg/m³"
"Paris""FR""2019-06-20 21:00:00+00:00""FR04014""no2"24.9"µg/m³"
"Paris""FR""2019-06-20 20:00:00+00:00""FR04014""no2"21.4"µg/m³"
row(
    air_quality['city'].unique(),
    air_quality.select(pl.col('city').unique())
)
shape: (3,)
city
str
"Antwerpen"
"London"
"Paris"
shape: (3, 1)
city
str
"Antwerpen"
"London"
"Paris"

Using pandas datetime properties#

# air_quality["datetime"] = pd.to_datetime(air_quality["datetime"])
air_quality = air_quality.with_columns(
    pl.col('datetime').str.to_datetime()
)
air_quality['datetime']
shape: (2_068,)
datetime
datetime[μs, UTC]
2019-06-21 00:00:00 UTC
2019-06-20 23:00:00 UTC
2019-06-20 22:00:00 UTC
2019-06-20 21:00:00 UTC
2019-06-20 20:00:00 UTC
2019-05-07 06:00:00 UTC
2019-05-07 04:00:00 UTC
2019-05-07 03:00:00 UTC
2019-05-07 02:00:00 UTC
2019-05-07 01:00:00 UTC
# air_quality["datetime"].min(), air_quality["datetime"].max()
air_quality.select(
    min=pl.col('datetime').min(),
    max=pl.col('datetime').max()
).row(0)
(datetime.datetime(2019, 5, 7, 1, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')),
 datetime.datetime(2019, 6, 21, 0, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')))
# air_quality["datetime"].max() - air_quality["datetime"].min()
air_quality.select(
    span=pl.col('datetime').max() - pl.col('datetime').min()
).item()
datetime.timedelta(days=44, seconds=82800)
# air_quality["month"] = air_quality["datetime"].dt.month
# Assign the result back: with_columns returns a NEW DataFrame and does not
# modify ``air_quality`` in place, so without the assignment the month
# column is silently discarded.
air_quality = air_quality.with_columns(
    month=pl.col('datetime').dt.month()
)
air_quality.head(5)
shape: (5, 7)
citycountrydatetimelocationparametervalueunit
strstrdatetime[μs, UTC]strstrf64str
"Paris""FR"2019-06-21 00:00:00 UTC"FR04014""no2"20.0"µg/m³"
"Paris""FR"2019-06-20 23:00:00 UTC"FR04014""no2"21.8"µg/m³"
"Paris""FR"2019-06-20 22:00:00 UTC"FR04014""no2"26.5"µg/m³"
"Paris""FR"2019-06-20 21:00:00 UTC"FR04014""no2"24.9"µg/m³"
"Paris""FR"2019-06-20 20:00:00 UTC"FR04014""no2"21.4"µg/m³"
# air_quality.groupby(
#    [air_quality["datetime"].dt.weekday, "location"])["value"].mean()
(
air_quality.group_by(
    pl.col('datetime').dt.weekday().alias('weekday'), 'location'
)
.agg(
    pl.col('value').mean()
)
.sort('weekday', 'location')
)
shape: (21, 3)
weekdaylocationvalue
i8strf64
1"BETR801"27.875
1"FR04014"24.85625
1"London Westminster"23.969697
2"BETR801"22.214286
2"FR04014"30.999359
6"FR04014"25.266154
6"London Westminster"24.977612
7"BETR801"21.896552
7"FR04014"23.274306
7"London Westminster"24.859155
(
air_quality
.group_by(
    pl.col('datetime').dt.hour().alias('hour')
)
.agg(
    pl.col('value').mean()
)
.sort('hour')
.hvplot.bar(x='hour', y='value')
)

Datetime as index#

no_2 = air_quality.pivot(on='location', index='datetime', values='value').sort('datetime')
no_2.head(5)
shape: (5, 4)
datetimeFR04014BETR801London Westminster
datetime[μs, UTC]f64f64f64
2019-05-07 01:00:00 UTC25.050.523.0
2019-05-07 02:00:00 UTC27.745.019.0
2019-05-07 03:00:00 UTC50.4null19.0
2019-05-07 04:00:00 UTC61.9null16.0
2019-05-07 05:00:00 UTC72.4nullnull
# no_2.index.year, no_2.index.weekday
row(
    no_2['datetime'].dt.year(),
    no_2['datetime'].dt.weekday()
)
shape: (1_033,)
datetime
i32
2019
2019
2019
2019
2019
2019
2019
2019
2019
2019
shape: (1_033,)
datetime
i8
2
2
2
2
2
4
4
4
4
5
no_2.filter(
    pl.col('datetime').dt.date().is_between(
        pl.date(2019, 5, 20), 
        pl.date(2019, 5, 21)
    )
).hvplot.line(x='datetime')

Resample a time series to another frequency#

# monthly_max = no_2.resample("ME").max()
monthly_max = (
no_2.group_by(
    pl.col('datetime').dt.date().dt.month_end()
)
.agg(
    pl.all().max()
)
.sort('datetime')
)
monthly_max 
shape: (2, 4)
datetimeFR04014BETR801London Westminster
datef64f64f64
2019-05-3197.074.597.0
2019-06-3084.752.552.0
# no_2.resample("D").mean().plot(style="-o", figsize=(10, 5))
df = (
no_2
.group_by(
    pl.col('datetime').dt.truncate('1d')
)
.agg(
    pl.all().mean()
)
)
df.hvplot.line(x='datetime') * df.hvplot.scatter(x='datetime')

How to manipulate textual data#

titanic = pl.read_csv("data/titanic.csv")
# titanic["Name"].str.lower()
row(
    titanic['Name'].str.to_lowercase(),
    titanic.select(pl.col('Name').str.to_lowercase()).to_series()
)
shape: (891,)
Name
str
"braund, mr. owen harris"
"cumings, mrs. john bradley (fl…
"heikkinen, miss laina"
"futrelle, mrs. jacques heath (…
"allen, mr. william henry"
"montvila, rev. juozas"
"graham, miss margaret edith"
"johnston, miss catherine helen…
"behr, mr. karl howell"
"dooley, mr. patrick"
shape: (891,)
Name
str
"braund, mr. owen harris"
"cumings, mrs. john bradley (fl…
"heikkinen, miss laina"
"futrelle, mrs. jacques heath (…
"allen, mr. william henry"
"montvila, rev. juozas"
"graham, miss margaret edith"
"johnston, miss catherine helen…
"behr, mr. karl howell"
"dooley, mr. patrick"
# titanic["Name"].str.split(",")
row(
    titanic['Name'].str.split(","),
    titanic.select(pl.col('Name').str.split(",")).to_series()
)
shape: (891,)
Name
list[str]
["Braund", " Mr. Owen Harris"]
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"]
["Heikkinen", " Miss Laina"]
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"]
["Allen", " Mr. William Henry"]
["Montvila", " Rev. Juozas"]
["Graham", " Miss Margaret Edith"]
["Johnston", " Miss Catherine Helen "Carrie""]
["Behr", " Mr. Karl Howell"]
["Dooley", " Mr. Patrick"]
shape: (891,)
Name
list[str]
["Braund", " Mr. Owen Harris"]
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"]
["Heikkinen", " Miss Laina"]
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"]
["Allen", " Mr. William Henry"]
["Montvila", " Rev. Juozas"]
["Graham", " Miss Margaret Edith"]
["Johnston", " Miss Catherine Helen "Carrie""]
["Behr", " Mr. Karl Howell"]
["Dooley", " Mr. Patrick"]
# titanic["Surname"] = titanic["Name"].str.split(",").str.get(0)
titanic = titanic.with_columns(
    Surname=pl.col('Name').str.split(",").list.get(0)
)
titanic['Surname']
shape: (891,)
Surname
str
"Braund"
"Cumings"
"Heikkinen"
"Futrelle"
"Allen"
"Montvila"
"Graham"
"Johnston"
"Behr"
"Dooley"
# titanic["Name"].str.contains("Countess")
row(
    titanic["Name"].str.contains("Countess"),
    titanic.select(pl.col('Name').str.contains("Countess")).to_series()
)
shape: (891,)
Name
bool
false
false
false
false
false
false
false
false
false
false
shape: (891,)
Name
bool
false
false
false
false
false
false
false
false
false
false
# titanic[titanic["Name"].str.contains("Countess")]
titanic.filter(
    pl.col('Name').str.contains("Countess")
)
shape: (1, 13)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSurname
i64i64i64strstrf64i64i64strf64strstrstr
76011"Rothes, the Countess. of (Lucy…"female"33.000"110152"86.5"B77""S""Rothes"
# titanic["Name"].str.len()
row(
    titanic["Name"].str.len_chars(),
    titanic.select(pl.col('Name').str.len_chars()).to_series()
)
shape: (891,)
Name
u32
23
51
21
44
24
21
27
39
21
19
shape: (891,)
Name
u32
23
51
21
44
24
21
27
39
21
19
# titanic["Name"].str.len().idxmax()
titanic.select(pl.col('Name').str.len_chars().arg_max()).item()
307
# titanic.loc[titanic["Name"].str.len().idxmax(), "Name"]
titanic.row(
    titanic.select(pl.col('Name').str.len_chars().arg_max()).item(), named=True
)['Name']
'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'
titanic.select(
    pl.col('Name').get(pl.col('Name').str.len_chars().arg_max())
).item()
'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'
# titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", "female": "F"})
titanic = titanic.with_columns(
    Sex_short=pl.col('Sex').replace({"male": "M", "female": "F"})
)
titanic['Sex_short']
shape: (891,)
Sex_short
str
"M"
"F"
"F"
"F"
"M"
"M"
"F"
"F"
"M"
"M"