{ "cells": [ { "cell_type": "markdown", "id": "03b217d4-8434-4396-8f8e-574dc5bae30d", "metadata": {}, "source": [ "# Getting started tutorials" ] }, { "cell_type": "markdown", "id": "45ef48ea-06e9-4087-9c2e-7c66535e95c1", "metadata": {}, "source": [ "This notebook contains code that reimplements the examples from the Pandas Getting Started documentation using Polars: https://pandas.pydata.org/docs/getting_started/. I hope this notebook helps you transition from Pandas to Polars more easily. Be mindful that Polars and Pandas have different philosophies, so there are some tasks that are easy to accomplish in Pandas but may require more effort in Polars." ] }, { "cell_type": "code", "execution_count": 36, "id": "ea8eea19-c88c-47c8-bc0f-0424f832e2b3", "metadata": {}, "outputs": [], "source": [ "from helper.jupyter import row\n", "import polars as pl\n", "from polars import selectors as cs\n", "import holoviews as hv\n", "import hvplot.polars # need import this to plot with hvplot" ] }, { "cell_type": "markdown", "id": "45d81bbd-ef21-46ec-9b71-8f54a01a7bfd", "metadata": {}, "source": [ "## What kind of data does pandas handle?" ] }, { "cell_type": "code", "execution_count": 37, "id": "ae87ff76-cd12-4e47-b2b3-c447968757d7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 3)
NameAgeSex
stri64str
"Braund, Mr. Owen Harris"22"male"
"Allen, Mr. William Henry"35"male"
"Bonnell, Miss. Elizabeth"58"female"
" ], "text/plain": [ "shape: (3, 3)\n", "┌──────────────────────────┬─────┬────────┐\n", "│ Name ┆ Age ┆ Sex │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i64 ┆ str │\n", "╞══════════════════════════╪═════╪════════╡\n", "│ Braund, Mr. Owen Harris ┆ 22 ┆ male │\n", "│ Allen, Mr. William Henry ┆ 35 ┆ male │\n", "│ Bonnell, Miss. Elizabeth ┆ 58 ┆ female │\n", "└──────────────────────────┴─────┴────────┘" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pl.DataFrame(\n", " {\n", " \"Name\": [\n", " \"Braund, Mr. Owen Harris\",\n", " \"Allen, Mr. William Henry\",\n", " \"Bonnell, Miss. Elizabeth\",\n", " ],\n", " \"Age\": [22, 35, 58],\n", " \"Sex\": [\"male\", \"male\", \"female\"],\n", " }\n", ")\n", "df" ] }, { "cell_type": "code", "execution_count": 38, "id": "f8232f48-cf91-45c7-b075-832afdb7b370", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3,)
Age
i64
22
35
58
\n", "shape: (3,)
Age
i64
22
35
58
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# df[\"Age\"]\n", "row(\n", " df['Age'],\n", " df.get_column('Age')\n", ")" ] }, { "cell_type": "code", "execution_count": 39, "id": "5d5efe7b-ba3d-41c9-8566-79c1052b3949", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3,)
Age
i64
22
35
58
" ], "text/plain": [ "shape: (3,)\n", "Series: 'Age' [i64]\n", "[\n", "\t22\n", "\t35\n", "\t58\n", "]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ages = pl.Series('Age', [22, 35, 58])\n", "ages" ] }, { "cell_type": "code", "execution_count": 40, "id": "74c7d15e-dbb0-485f-8d3e-c768116ae865", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "58" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Age'].max()" ] }, { "cell_type": "code", "execution_count": 41, "id": "05de0848-d8dd-4591-adc0-25e741a4e067", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "58" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ages.max()" ] }, { "cell_type": "code", "execution_count": 42, "id": "d4d21658-0c50-408f-bf49-60fa0545d3f3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "58" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.select(pl.col('Age').max()).item()" ] }, { "cell_type": "code", "execution_count": 43, "id": "835fd1bc-d666-4a29-be17-51d6ceabcf8a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (9, 4)
statisticNameAgeSex
strstrf64str
"count""3"3.0"3"
"null_count""0"0.0"0"
"mean"null38.333333null
"std"null18.230012null
"min""Allen, Mr. William Henry"22.0"female"
"25%"null35.0null
"50%"null35.0null
"75%"null58.0null
"max""Braund, Mr. Owen Harris"58.0"male"
" ], "text/plain": [ "shape: (9, 4)\n", "┌────────────┬──────────────────────────┬───────────┬────────┐\n", "│ statistic ┆ Name ┆ Age ┆ Sex │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 ┆ str │\n", "╞════════════╪══════════════════════════╪═══════════╪════════╡\n", "│ count ┆ 3 ┆ 3.0 ┆ 3 │\n", "│ null_count ┆ 0 ┆ 0.0 ┆ 0 │\n", "│ mean ┆ null ┆ 38.333333 ┆ null │\n", "│ std ┆ null ┆ 18.230012 ┆ null │\n", "│ min ┆ Allen, Mr. William Henry ┆ 22.0 ┆ female │\n", "│ 25% ┆ null ┆ 35.0 ┆ null │\n", "│ 50% ┆ null ┆ 35.0 ┆ null │\n", "│ 75% ┆ null ┆ 58.0 ┆ null │\n", "│ max ┆ Braund, Mr. Owen Harris ┆ 58.0 ┆ male │\n", "└────────────┴──────────────────────────┴───────────┴────────┘" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "id": "f72a6717-aca0-4a13-a5e3-41a1db6489e6", "metadata": {}, "source": [ "## How do I read and write tabular data?" ] }, { "cell_type": "code", "execution_count": 44, "id": "573ddedd-978d-47f4-b688-7e1d345eeb61", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
" ], "text/plain": [ "shape: (3, 12)\n", "┌─────────────┬──────────┬────────┬────────────────────────────┬───┬──────────────────┬─────────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪════════════════════════════╪═══╪══════════════════╪═════════╪═══════╪══════════╡\n", "│ 1 ┆ 0 ┆ 3 ┆ Braund, Mr. Owen Harris ┆ … ┆ A/5 21171 ┆ 7.25 ┆ null ┆ S │\n", "│ 2 ┆ 1 ┆ 1 ┆ Cumings, Mrs. John Bradley ┆ … ┆ PC 17599 ┆ 71.2833 ┆ C85 ┆ C │\n", "│ ┆ ┆ ┆ (Fl… ┆ ┆ ┆ ┆ ┆ │\n", "│ 3 ┆ 1 ┆ 3 ┆ Heikkinen, Miss Laina ┆ … ┆ STON/O2. 3101282 ┆ 7.925 ┆ null ┆ S │\n", "└─────────────┴──────────┴────────┴────────────────────────────┴───┴──────────────────┴─────────┴───────┴──────────┘" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic = pl.read_csv('data/titanic.csv')\n", "titanic.head(3)" ] }, { "cell_type": "code", "execution_count": 45, "id": "6df71e89-132f-4423-b3ba-079f986bc9af", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
88903"Johnston, Miss Catherine Helen…"female"null12"W./C. 6607"23.45null"S"
89011"Behr, Mr. Karl Howell""male"26.000"111369"30.0"C148""C"
89103"Dooley, Mr. Patrick""male"32.000"370376"7.75null"Q"
" ], "text/plain": [ "shape: (3, 12)\n", "┌─────────────┬──────────┬────────┬─────────────────────────────────┬───┬────────────┬───────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪═════════════════════════════════╪═══╪════════════╪═══════╪═══════╪══════════╡\n", "│ 889 ┆ 0 ┆ 3 ┆ Johnston, Miss Catherine Helen… ┆ … ┆ W./C. 6607 ┆ 23.45 ┆ null ┆ S │\n", "│ 890 ┆ 1 ┆ 1 ┆ Behr, Mr. Karl Howell ┆ … ┆ 111369 ┆ 30.0 ┆ C148 ┆ C │\n", "│ 891 ┆ 0 ┆ 3 ┆ Dooley, Mr. Patrick ┆ … ┆ 370376 ┆ 7.75 ┆ null ┆ Q │\n", "└─────────────┴──────────┴────────┴─────────────────────────────────┴───┴────────────┴───────┴───────┴──────────┘" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.tail(3)" ] }, { "cell_type": "code", "execution_count": 46, "id": "3f445448-f846-4fac-aa6c-d7dbe2b68cb5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Int64,\n", " Int64,\n", " Int64,\n", " String,\n", " String,\n", " Float64,\n", " Int64,\n", " Int64,\n", " String,\n", " Float64,\n", " String,\n", " String]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.dtypes" ] }, { "cell_type": "code", "execution_count": 47, "id": "aebf948b-b1e3-45a8-b620-6e8dcda0bb34", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Schema([('PassengerId', Int64),\n", " ('Survived', Int64),\n", " ('Pclass', Int64),\n", " ('Name', String),\n", " ('Sex', String),\n", " ('Age', Float64),\n", " ('SibSp', Int64),\n", " ('Parch', Int64),\n", " ('Ticket', String),\n", " ('Fare', Float64),\n", " ('Cabin', String),\n", " ('Embarked', String)])" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.schema" ] }, { "cell_type": "code", "execution_count": 48, "id": "42a9e7f7-0f80-4d8e-b67a-85ab9e0a5d7f", "metadata": {}, "outputs": [], "source": [ "titanic.write_excel('data/titanic.xlsx', worksheet='passengers');" ] }, { "cell_type": "code", "execution_count": 49, "id": "dffdbfc4-2146-401e-9662-22b7e54be7c0", "metadata": {}, "outputs": [], "source": [ "titanic = pl.read_excel('data/titanic.xlsx', sheet_name='passengers')" ] }, { "cell_type": "code", "execution_count": 50, "id": "197017b6-ee21-4195-a636-f3fca0327f49", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows: 891\n", "Columns: 12\n", "$ PassengerId 1, 2, 3, 4, 5, 6, 7, 8, 9, 10\n", "$ Survived 0, 1, 1, 1, 0, 0, 0, 0, 1, 1\n", "$ Pclass 3, 1, 3, 1, 3, 3, 1, 3, 3, 2\n", "$ Name 'Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Heikkinen, Miss Laina', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'Allen, Mr. William Henry', 'Moran, Mr. James', 'McCarthy, Mr. Timothy J', 'Palsson, Master Gosta Leonard', 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)', 'Nasser, Mrs. Nicholas (Adele Achem)'\n", "$ Sex 'male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'female'\n", "$ Age 22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0\n", "$ SibSp 1, 1, 0, 1, 0, 0, 0, 3, 0, 1\n", "$ Parch 0, 0, 0, 0, 0, 0, 0, 1, 2, 0\n", "$ Ticket 'A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909', '347742', '237736'\n", "$ Fare 7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51.8625, 21.075, 11.1333, 30.0708\n", "$ Cabin None, 'C85', None, 'C123', None, None, 'E46', None, None, None\n", "$ Embarked 'S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C'\n", "\n" ] } ], "source": [ "# titanic.info()\n", "titanic.glimpse()" ] }, { "cell_type": "code", "execution_count": 51, "id": "1b8f7160-ae8a-4cd0-bbaf-ac4edd3547bd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "85640" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.estimated_size()" ] }, { "cell_type": "code", "execution_count": 52, "id": "7a2d22b3-3153-4d68-8240-ad620fbfc333", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (1, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
u32u32u32u32u32u32u32u32u32u32u32u32
0000017700006872
" ], "text/plain": [ "shape: (1, 12)\n", "┌─────────────┬──────────┬────────┬──────┬───┬────────┬──────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ u32 ┆ u32 ┆ u32 ┆ ┆ u32 ┆ u32 ┆ u32 ┆ u32 │\n", "╞═════════════╪══════════╪════════╪══════╪═══╪════════╪══════╪═══════╪══════════╡\n", "│ 0 ┆ 0 ┆ 0 ┆ 0 ┆ … ┆ 0 ┆ 0 ┆ 687 ┆ 2 │\n", "└─────────────┴──────────┴────────┴──────┴───┴────────┴──────┴───────┴──────────┘" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.null_count()" ] }, { "cell_type": "code", "execution_count": 53, "id": "ef2286ec-5c63-4b2f-95b8-e6b4bca94855", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "891" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.n_unique()" ] }, { "cell_type": "code", "execution_count": 54, "id": "10486b23-f527-4c55-9330-e94ed3db47ae", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (1, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
u32u32u32u32u32u32u32u32u32u32u32u32
89123891289776812481484
" ], "text/plain": [ "shape: (1, 12)\n", "┌─────────────┬──────────┬────────┬──────┬───┬────────┬──────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ u32 ┆ u32 ┆ u32 ┆ ┆ u32 ┆ u32 ┆ u32 ┆ u32 │\n", "╞═════════════╪══════════╪════════╪══════╪═══╪════════╪══════╪═══════╪══════════╡\n", "│ 891 ┆ 2 ┆ 3 ┆ 891 ┆ … ┆ 681 ┆ 248 ┆ 148 ┆ 4 │\n", "└─────────────┴──────────┴────────┴──────┴───┴────────┴──────┴───────┴──────────┘" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.select(pl.all().n_unique())" ] }, { "cell_type": "markdown", "id": "810c9ee2-e4f3-455f-bebe-c9c3cd346ae6", "metadata": {}, "source": [ "## How do I select a subset of a DataFrame?" ] }, { "cell_type": "markdown", "id": "1c54320e-4760-4ca2-a49e-5ad6e9405717", "metadata": {}, "source": [ "### How do I select specific columns from a DataFrame?" ] }, { "cell_type": "code", "execution_count": 55, "id": "329cd9a4-79a8-42c2-a874-30974f4d24db", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5,)
Age
f64
22.0
38.0
26.0
35.0
35.0
" ], "text/plain": [ "shape: (5,)\n", "Series: 'Age' [f64]\n", "[\n", "\t22.0\n", "\t38.0\n", "\t26.0\n", "\t35.0\n", "\t35.0\n", "]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "age = titanic['Age']\n", "age = titanic.get_column('Age') # or this\n", "age.head(5)" ] }, { "cell_type": "code", "execution_count": 56, "id": "a6892cd1-3629-476c-8e9d-7b1b82006113", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "polars.series.series.Series" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(age)" ] }, { "cell_type": "code", "execution_count": 57, "id": "5681ff91-2388-42e0-a35a-c70ed09a765a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(891,)" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "age.shape" ] }, { "cell_type": "code", "execution_count": 58, "id": "1210ceba-163e-4cd2-b55e-9d3460b1b47d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 2)
AgeSex
f64str
22.0"male"
38.0"female"
26.0"female"
35.0"female"
35.0"male"
" ], "text/plain": [ "shape: (5, 2)\n", "┌──────┬────────┐\n", "│ Age ┆ Sex │\n", "│ --- ┆ --- │\n", "│ f64 ┆ str │\n", "╞══════╪════════╡\n", "│ 22.0 ┆ male │\n", "│ 38.0 ┆ female │\n", "│ 26.0 ┆ female │\n", "│ 35.0 ┆ female │\n", "│ 35.0 ┆ male │\n", "└──────┴────────┘" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "age_sex = titanic[['Age', 'Sex']]\n", "age_sex = titanic.select('Age', 'Sex') #or this\n", "age_sex.head(5)" ] }, { "cell_type": "code", "execution_count": 59, "id": "0ffb3c98-6667-41ed-8fd8-0db871afd81d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "polars.dataframe.frame.DataFrame" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(age_sex)" ] }, { "cell_type": "code", "execution_count": 60, "id": "2c2ca116-fab6-4418-b2d2-c98df835dc26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(891, 2)" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "age_sex.shape" ] }, { "cell_type": "markdown", "id": "3076219b-f363-4da0-987e-70a1bbc3874d", "metadata": {}, "source": [ "### How do I filter specific rows from a DataFrame?" ] }, { "cell_type": "code", "execution_count": 61, "id": "0cdda9a4-a59f-4dee-b3f2-150a92bbc1d1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
701"McCarthy, Mr. Timothy J""male"54.000"17463"51.8625"E46""S"
1211"Bonnell, Miss Elizabeth""female"58.000"113783"26.55"C103""S"
1403"Andersson, Mr. Anders Johan""male"39.015"347082"31.275null"S"
1612"Hewlett, Mrs. (Mary D Kingcome…"female"55.000"248706"16.0null"S"
" ], "text/plain": [ "shape: (5, 12)\n", "┌─────────────┬──────────┬────────┬─────────────────────────────────┬───┬──────────┬─────────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪═════════════════════════════════╪═══╪══════════╪═════════╪═══════╪══════════╡\n", "│ 2 ┆ 1 ┆ 1 ┆ Cumings, Mrs. John Bradley (Fl… ┆ … ┆ PC 17599 ┆ 71.2833 ┆ C85 ┆ C │\n", "│ 7 ┆ 0 ┆ 1 ┆ McCarthy, Mr. Timothy J ┆ … ┆ 17463 ┆ 51.8625 ┆ E46 ┆ S │\n", "│ 12 ┆ 1 ┆ 1 ┆ Bonnell, Miss Elizabeth ┆ … ┆ 113783 ┆ 26.55 ┆ C103 ┆ S │\n", "│ 14 ┆ 0 ┆ 3 ┆ Andersson, Mr. Anders Johan ┆ … ┆ 347082 ┆ 31.275 ┆ null ┆ S │\n", "│ 16 ┆ 1 ┆ 2 ┆ Hewlett, Mrs. (Mary D Kingcome… ┆ … ┆ 248706 ┆ 16.0 ┆ null ┆ S │\n", "└─────────────┴──────────┴────────┴─────────────────────────────────┴───┴──────────┴─────────┴───────┴──────────┘" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# above_35 = titanic[titanic[\"Age\"] > 35]\n", "above_35 = titanic.filter(pl.col('Age') > 35)\n", "above_35.head(5)" ] }, { "cell_type": "code", "execution_count": 62, "id": "7b608350-15cd-4b43-a502-a5e35548442d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (891,)
Age
bool
false
true
false
false
false
false
false
null
false
false
\n", "shape: (891, 1)
Age
bool
false
true
false
false
false
false
false
null
false
false
\n", "shape: (891,)
Age
bool
false
true
false
false
false
false
false
null
false
false
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# titanic[\"Age\"] > 35\n", "row(\n", " titanic['Age'] > 35,\n", " titanic.select(pl.col('Age') > 35),\n", " titanic.select(pl.col('Age') > 35).to_series()\n", ")" ] }, { "cell_type": "code", "execution_count": 63, "id": "92524cb5-360a-4405-9e90-478e99584808", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(217, 12)" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "above_35.shape" ] }, { "cell_type": "code", "execution_count": 64, "id": "300e732c-a1b1-4047-9cbe-0d27bba6978a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"
603"Moran, Mr. James""male"null00"330877"8.4583null"Q"
803"Palsson, Master Gosta Leonard""male"2.031"349909"21.075null"S"
" ], "text/plain": [ "shape: (5, 12)\n", "┌─────────────┬──────────┬────────┬───────────────────────────────┬───┬──────────────────┬────────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪═══════════════════════════════╪═══╪══════════════════╪════════╪═══════╪══════════╡\n", "│ 1 ┆ 0 ┆ 3 ┆ Braund, Mr. Owen Harris ┆ … ┆ A/5 21171 ┆ 7.25 ┆ null ┆ S │\n", "│ 3 ┆ 1 ┆ 3 ┆ Heikkinen, Miss Laina ┆ … ┆ STON/O2. 3101282 ┆ 7.925 ┆ null ┆ S │\n", "│ 5 ┆ 0 ┆ 3 ┆ Allen, Mr. William Henry ┆ … ┆ 373450 ┆ 8.05 ┆ null ┆ S │\n", "│ 6 ┆ 0 ┆ 3 ┆ Moran, Mr. James ┆ … ┆ 330877 ┆ 8.4583 ┆ null ┆ Q │\n", "│ 8 ┆ 0 ┆ 3 ┆ Palsson, Master Gosta Leonard ┆ … ┆ 349909 ┆ 21.075 ┆ null ┆ S │\n", "└─────────────┴──────────┴────────┴───────────────────────────────┴───┴──────────────────┴────────┴───────┴──────────┘" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# class_23 = titanic[titanic[\"Pclass\"].isin([2, 3])]\n", "class_23 = titanic.filter(pl.col('Pclass').is_in([2, 3]))\n", "class_23.head(5)" ] }, { "cell_type": "code", "execution_count": 65, "id": "ced5579e-54ff-434a-ba08-f24986e81eba", "metadata": {}, "outputs": [], "source": [ "# class_23 = titanic[(titanic[\"Pclass\"] == 2) | (titanic[\"Pclass\"] == 3)]\n", "class_23 = titanic.filter(\n", " (pl.col('Pclass') == 2) | (pl.col('Pclass') == 3)\n", ")\n", "\n", "class_23 = titanic.filter(\n", " pl.col('Pclass').eq(2).or_(pl.col('Pclass').eq(3))\n", ")" ] }, { "cell_type": "code", "execution_count": 66, "id": "b29295cb-71b3-4ba1-a515-a93fbaaeb2d6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"Braund, Mr. Owen Harris""male"22.010"A/5 21171"7.25null"S"
211"Cumings, Mrs. John Bradley (Fl…"female"38.010"PC 17599"71.2833"C85""C"
313"Heikkinen, Miss Laina""female"26.000"STON/O2. 3101282"7.925null"S"
411"Futrelle, Mrs. Jacques Heath (…"female"35.010"113803"53.1"C123""S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"
" ], "text/plain": [ "shape: (5, 12)\n", "┌─────────────┬──────────┬────────┬────────────────────────────────┬───┬──────────────────┬─────────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪════════════════════════════════╪═══╪══════════════════╪═════════╪═══════╪══════════╡\n", "│ 1 ┆ 0 ┆ 3 ┆ Braund, Mr. Owen Harris ┆ … ┆ A/5 21171 ┆ 7.25 ┆ null ┆ S │\n", "│ 2 ┆ 1 ┆ 1 ┆ Cumings, Mrs. John Bradley ┆ … ┆ PC 17599 ┆ 71.2833 ┆ C85 ┆ C │\n", "│ ┆ ┆ ┆ (Fl… ┆ ┆ ┆ ┆ ┆ │\n", "│ 3 ┆ 1 ┆ 3 ┆ Heikkinen, Miss Laina ┆ … ┆ STON/O2. 3101282 ┆ 7.925 ┆ null ┆ S │\n", "│ 4 ┆ 1 ┆ 1 ┆ Futrelle, Mrs. Jacques Heath ┆ … ┆ 113803 ┆ 53.1 ┆ C123 ┆ S │\n", "│ ┆ ┆ ┆ (… ┆ ┆ ┆ ┆ ┆ │\n", "│ 5 ┆ 0 ┆ 3 ┆ Allen, Mr. William Henry ┆ … ┆ 373450 ┆ 8.05 ┆ null ┆ S │\n", "└─────────────┴──────────┴────────┴────────────────────────────────┴───┴──────────────────┴─────────┴───────┴──────────┘" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# age_no_na = titanic[titanic[\"Age\"].notna()]\n", "age_no_na = titanic.filter(pl.col('Age').is_not_null())\n", "age_no_na.head()" ] }, { "cell_type": "code", "execution_count": 67, "id": "8a148344-eecd-4a75-b8a4-5e6667cf4cbd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(714, 12)" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "age_no_na.shape" ] }, { "cell_type": "markdown", "id": "b7bf7bbe-517a-4ba7-b817-83b95d0c533a", "metadata": {}, "source": [ "### How do I select specific rows and columns from a DataFrame?" ] }, { "cell_type": "code", "execution_count": 68, "id": "6831d6f3-64c4-4109-ab9e-9ff8b29de151", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5,)
Name
str
"Cumings, Mrs. John Bradley (Fl…
"McCarthy, Mr. Timothy J"
"Bonnell, Miss Elizabeth"
"Andersson, Mr. Anders Johan"
"Hewlett, Mrs. (Mary D Kingcome…
" ], "text/plain": [ "shape: (5,)\n", "Series: 'Name' [str]\n", "[\n", "\t\"Cumings, Mrs. John Bradley (Fl…\n", "\t\"McCarthy, Mr. Timothy J\"\n", "\t\"Bonnell, Miss Elizabeth\"\n", "\t\"Andersson, Mr. Anders Johan\"\n", "\t\"Hewlett, Mrs. (Mary D Kingcome…\n", "]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# adult_names = titanic.loc[titanic[\"Age\"] > 35, \"Name\"]\n", "adult_names = (\n", " titanic\n", " .filter(pl.col('Age') > 35)\n", " .select('Name')\n", " .to_series()\n", ")\n", "adult_names.head(5)" ] }, { "cell_type": "code", "execution_count": 69, "id": "1e1549f8-603a-41e3-83ad-9a06a7ff37b6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (16, 3)
PclassNameSex
i64strstr
2"Nasser, Mrs. Nicholas (Adele A…"female"
3"Sandstrom, Miss Marguerite Rut""female"
1"Bonnell, Miss Elizabeth""female"
3"Saundercock, Mr. William Henry""male"
3"Andersson, Mr. Anders Johan""male"
2"Fynney, Mr. Joseph J""male"
2"Beesley, Mr. Lawrence""male"
3"McGowan, Miss Anna "Annie"""female"
1"Sloper, Mr. William Thompson""male"
3"Palsson, Miss Torborg Danira""female"
" ], "text/plain": [ "shape: (16, 3)\n", "┌────────┬─────────────────────────────────┬────────┐\n", "│ Pclass ┆ Name ┆ Sex │\n", "│ --- ┆ --- ┆ --- │\n", "│ i64 ┆ str ┆ str │\n", "╞════════╪═════════════════════════════════╪════════╡\n", "│ 2 ┆ Nasser, Mrs. Nicholas (Adele A… ┆ female │\n", "│ 3 ┆ Sandstrom, Miss Marguerite Rut ┆ female │\n", "│ 1 ┆ Bonnell, Miss Elizabeth ┆ female │\n", "│ 3 ┆ Saundercock, Mr. William Henry ┆ male │\n", "│ 3 ┆ Andersson, Mr. Anders Johan ┆ male │\n", "│ … ┆ … ┆ … │\n", "│ 2 ┆ Fynney, Mr. Joseph J ┆ male │\n", "│ 2 ┆ Beesley, Mr. Lawrence ┆ male │\n", "│ 3 ┆ McGowan, Miss Anna \"Annie\" ┆ female │\n", "│ 1 ┆ Sloper, Mr. William Thompson ┆ male │\n", "│ 3 ┆ Palsson, Miss Torborg Danira ┆ female │\n", "└────────┴─────────────────────────────────┴────────┘" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic.iloc[9:25, 2:5]\n", "titanic.select(pl.nth(range(2, 5)).slice(9, 25 - 9))" ] }, { "cell_type": "code", "execution_count": 70, "id": "0d94c3d3-3d60-4941-a09c-89add4c24eff", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
103"anonymous""male"22.010"A/5 21171"7.25null"S"
211"anonymous""female"38.010"PC 17599"71.2833"C85""C"
313"anonymous""female"26.000"STON/O2. 3101282"7.925null"S"
411"Futrelle, Mrs. Jacques Heath (…"female"35.010"113803"53.1"C123""S"
503"Allen, Mr. William Henry""male"35.000"373450"8.05null"S"
" ], "text/plain": [ "shape: (5, 12)\n", "┌─────────────┬──────────┬────────┬────────────────────────────────┬───┬──────────────────┬─────────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪════════════════════════════════╪═══╪══════════════════╪═════════╪═══════╪══════════╡\n", "│ 1 ┆ 0 ┆ 3 ┆ anonymous ┆ … ┆ A/5 21171 ┆ 7.25 ┆ null ┆ S │\n", "│ 2 ┆ 1 ┆ 1 ┆ anonymous ┆ … ┆ PC 17599 ┆ 71.2833 ┆ C85 ┆ C │\n", "│ 3 ┆ 1 ┆ 3 ┆ anonymous ┆ … ┆ STON/O2. 3101282 ┆ 7.925 ┆ null ┆ S │\n", "│ 4 ┆ 1 ┆ 1 ┆ Futrelle, Mrs. Jacques Heath ┆ … ┆ 113803 ┆ 53.1 ┆ C123 ┆ S │\n", "│ ┆ ┆ ┆ (… ┆ ┆ ┆ ┆ ┆ │\n", "│ 5 ┆ 0 ┆ 3 ┆ Allen, Mr. William Henry ┆ … ┆ 373450 ┆ 8.05 ┆ null ┆ S │\n", "└─────────────┴──────────┴────────┴────────────────────────────────┴───┴──────────────────┴─────────┴───────┴──────────┘" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic.iloc[0:3, 3] = \"anonymous\"\n", "titanic = titanic.with_columns(\n", " pl.when(pl.int_range(pl.len()) < 3)\n", " .then(pl.lit('anonymous'))\n", " .otherwise(pl.nth(3))\n", " .name.keep()\n", ")\n", "titanic.head(5)" ] }, { "cell_type": "markdown", "id": "1b70c488-4448-4597-951f-b542e0011a55", "metadata": {}, "source": [ "## How do I create plots in pandas?" ] }, { "cell_type": "code", "execution_count": 71, "id": "513b3fcd-59c2-468f-98a1-6721607e62c4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 4)
datetimestation_antwerpstation_parisstation_london
datetime[μs]f64f64f64
2019-05-07 02:00:00nullnull23.0
2019-05-07 03:00:0050.525.019.0
2019-05-07 04:00:0045.027.719.0
2019-05-07 05:00:00null50.416.0
2019-05-07 06:00:00null61.9null
" ], "text/plain": [ "shape: (5, 4)\n", "┌─────────────────────┬─────────────────┬───────────────┬────────────────┐\n", "│ datetime ┆ station_antwerp ┆ station_paris ┆ station_london │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs] ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════════════╪═══════════════╪════════════════╡\n", "│ 2019-05-07 02:00:00 ┆ null ┆ null ┆ 23.0 │\n", "│ 2019-05-07 03:00:00 ┆ 50.5 ┆ 25.0 ┆ 19.0 │\n", "│ 2019-05-07 04:00:00 ┆ 45.0 ┆ 27.7 ┆ 19.0 │\n", "│ 2019-05-07 05:00:00 ┆ null ┆ 50.4 ┆ 16.0 │\n", "│ 2019-05-07 06:00:00 ┆ null ┆ 61.9 ┆ null │\n", "└─────────────────────┴─────────────────┴───────────────┴────────────────┘" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality = pl.read_csv(\"data/air_quality_no2.csv\", try_parse_dates=True)\n", "air_quality.head(5)" ] }, { "cell_type": "code", "execution_count": 72, "id": "5d3400d7-0419-4299-9ce7-776b10619063", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":NdOverlay [Variable]\n", " :Curve [datetime] (value)" ] }, "execution_count": 72, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1004" } }, "output_type": "execute_result" } ], "source": [ "air_quality.hvplot.line(x='datetime')" ] }, { "cell_type": "code", "execution_count": 73, "id": "9096c1c3-ca0a-41af-a5ce-7e58c87c8ed5", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":Curve [datetime] (station_paris)" ] }, "execution_count": 73, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1119" } }, "output_type": "execute_result" } ], "source": [ "air_quality.hvplot.line(x='datetime', y='station_paris')" ] }, { "cell_type": "code", "execution_count": 74, "id": "26770aa7-0b54-4753-88b9-1157a3d21cee", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":Scatter [station_london] (station_paris)" ] }, "execution_count": 74, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1194" } }, "output_type": "execute_result" } ], "source": [ "air_quality.hvplot.scatter(x=\"station_london\", y=\"station_paris\", alpha=0.5)" ] }, { "cell_type": "code", "execution_count": 75, "id": "d9df3145-e550-4f8a-8916-cb1013afee3a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['area',\n", " 'bar',\n", " 'barh',\n", " 'bivariate',\n", " 'box',\n", " 'dataset',\n", " 'density',\n", " 'errorbars',\n", " 'explorer',\n", " 'heatmap',\n", " 'hexbin',\n", " 'hist',\n", " 'kde',\n", " 'labels',\n", " 'line',\n", " 'ohlc',\n", " 'paths',\n", " 'points',\n", " 'polygons',\n", " 'scatter',\n", " 'step',\n", " 'table',\n", " 'vectorfield',\n", " 'violin']" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[\n", " method_name\n", " for method_name in dir(air_quality.hvplot)\n", " if not method_name.startswith(\"_\")\n", "]" ] }, { "cell_type": "code", "execution_count": 76, "id": "7a0185d1-2c5e-4c4a-a23b-156173beb42e", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":BoxWhisker [Variable] (value)" ] }, "execution_count": 76, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1257" } }, "output_type": "execute_result" } ], "source": [ "air_quality.hvplot.box()" ] }, { "cell_type": "code", "execution_count": 77, "id": "16fc8c29-2a7e-45e5-a218-a7c20759b996", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":NdLayout [station]\n", " :Area [datetime] (quality)" ] }, "execution_count": 77, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1406" } }, "output_type": "execute_result" } ], "source": [ "# air_quality.plot.area(figsize=(12, 4), subplots=True)\n", "(\n", "air_quality\n", ".unpivot(index=\"datetime\", variable_name=\"station\", value_name=\"quality\")\n", ".hvplot.area(x='datetime', y='quality', groupby=\"station\")\n", ".opts(height=180)\n", ".layout()\n", ".cols(1)\n", ")" ] }, { "cell_type": "code", "execution_count": 78, "id": "366045e1-6157-4e64-a4a6-a7df384cf475", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":NdOverlay [station]\n", " :Area [datetime] (NO2 concentration)" ] }, "execution_count": 78, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1617" } }, "output_type": "execute_result" } ], "source": [ "fig = (\n", "air_quality\n", ".unpivot(index=\"datetime\", variable_name=\"station\", value_name=\"quality\")\n", ".with_columns(\n", " pl.col('quality').fill_null(0)\n", ")\n", ".hvplot.area(x='datetime', y='quality', groupby='station')\n", ".redim(quality=r\"NO2 concentration\")\n", ".opts(alpha=0.8)\n", ".overlay()\n", ")\n", "fig" ] }, { "cell_type": "code", "execution_count": 79, "id": "fbaff033-11ce-4d74-9515-743d86b6edf9", "metadata": {}, "outputs": [], "source": [ "hv.save(fig, 'data/no2_concentrations.html')" ] }, { "cell_type": "markdown", "id": "c1e25eed-4a9d-46b4-a901-94afc8eb6af7", "metadata": {}, "source": [ "## How to create new columns derived from existing columns" ] }, { "cell_type": "code", "execution_count": 80, "id": "7ab5348a-d00e-4e26-92a1-490895a0908e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 4)
datetimestation_antwerpstation_parisstation_london
datetime[μs]f64f64f64
2019-05-07 02:00:00nullnull23.0
2019-05-07 03:00:0050.525.019.0
2019-05-07 04:00:0045.027.719.0
2019-05-07 05:00:00null50.416.0
2019-05-07 06:00:00null61.9null
" ], "text/plain": [ "shape: (5, 4)\n", "┌─────────────────────┬─────────────────┬───────────────┬────────────────┐\n", "│ datetime ┆ station_antwerp ┆ station_paris ┆ station_london │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs] ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════════════╪═══════════════╪════════════════╡\n", "│ 2019-05-07 02:00:00 ┆ null ┆ null ┆ 23.0 │\n", "│ 2019-05-07 03:00:00 ┆ 50.5 ┆ 25.0 ┆ 19.0 │\n", "│ 2019-05-07 04:00:00 ┆ 45.0 ┆ 27.7 ┆ 19.0 │\n", "│ 2019-05-07 05:00:00 ┆ null ┆ 50.4 ┆ 16.0 │\n", "│ 2019-05-07 06:00:00 ┆ null ┆ 61.9 ┆ null │\n", "└─────────────────────┴─────────────────┴───────────────┴────────────────┘" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality = pl.read_csv(\"data/air_quality_no2.csv\", try_parse_dates=True)\n", "air_quality.head(5)" ] }, { "cell_type": "code", "execution_count": 81, "id": "2169944f-b269-4a1f-883f-c0650faa0b48", "metadata": {}, "outputs": [], "source": [ "# air_quality[\"london_mg_per_cubic\"] = air_quality[\"station_london\"] * 1.88\n", "air_quality = air_quality.with_columns(\n", " london_mg_per_cubic = pl.col(\"station_london\") * 1.882\n", ")" ] }, { "cell_type": "code", "execution_count": 82, "id": "68441a39-8306-43bb-8ffc-2e37149bc910", "metadata": {}, "outputs": [], "source": [ "# air_quality[\"ratio_paris_antwerp\"] = (\n", "# air_quality[\"station_paris\"] / air_quality[\"station_antwerp\"]\n", "# )\n", "\n", "air_quality = air_quality.with_columns(\n", " ratio_paris_antwerp = pl.col(\"station_paris\") / pl.col(\"station_antwerp\")\n", ")" ] }, { "cell_type": "code", "execution_count": 83, "id": "b62bd450-1cfa-4150-b2cb-415ca680490e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 6)
datetimeBETR801FR04014London Westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime[μs]f64f64f64f64f64
2019-05-07 02:00:00nullnull23.043.286null
2019-05-07 03:00:0050.525.019.035.7580.49505
2019-05-07 04:00:0045.027.719.035.7580.615556
2019-05-07 05:00:00null50.416.030.112null
2019-05-07 06:00:00null61.9nullnullnull
" ], "text/plain": [ "shape: (5, 6)\n", "┌─────────────────────┬─────────┬─────────┬────────────────────┬─────────────────────┬─────────────────────┐\n", "│ datetime ┆ BETR801 ┆ FR04014 ┆ London Westminster ┆ london_mg_per_cubic ┆ ratio_paris_antwerp │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs] ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════╪═════════╪════════════════════╪═════════════════════╪═════════════════════╡\n", "│ 2019-05-07 02:00:00 ┆ null ┆ null ┆ 23.0 ┆ 43.286 ┆ null │\n", "│ 2019-05-07 03:00:00 ┆ 50.5 ┆ 25.0 ┆ 19.0 ┆ 35.758 ┆ 0.49505 │\n", "│ 2019-05-07 04:00:00 ┆ 45.0 ┆ 27.7 ┆ 19.0 ┆ 35.758 ┆ 0.615556 │\n", "│ 2019-05-07 05:00:00 ┆ null ┆ 50.4 ┆ 16.0 ┆ 30.112 ┆ null │\n", "│ 2019-05-07 06:00:00 ┆ null ┆ 61.9 ┆ null ┆ null ┆ null │\n", "└─────────────────────┴─────────┴─────────┴────────────────────┴─────────────────────┴─────────────────────┘" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality_renamed = air_quality.rename(\n", " {\n", " \"station_antwerp\": \"BETR801\",\n", " \"station_paris\": \"FR04014\",\n", " \"station_london\": \"London Westminster\",\n", " }\n", ")\n", "air_quality_renamed.head(5)" ] }, { "cell_type": "code", "execution_count": 84, "id": "83cefab5-caac-4e7b-830e-22cf6bcc97e2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 6)
datetimebetr801fr04014london westminsterlondon_mg_per_cubicratio_paris_antwerp
datetime[μs]f64f64f64f64f64
2019-05-07 02:00:00nullnull23.043.286null
2019-05-07 03:00:0050.525.019.035.7580.49505
2019-05-07 04:00:0045.027.719.035.7580.615556
2019-05-07 05:00:00null50.416.030.112null
2019-05-07 06:00:00null61.9nullnullnull
" ], "text/plain": [ "shape: (5, 6)\n", "┌─────────────────────┬─────────┬─────────┬────────────────────┬─────────────────────┬─────────────────────┐\n", "│ datetime ┆ betr801 ┆ fr04014 ┆ london westminster ┆ london_mg_per_cubic ┆ ratio_paris_antwerp │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs] ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════╪═════════╪════════════════════╪═════════════════════╪═════════════════════╡\n", "│ 2019-05-07 02:00:00 ┆ null ┆ null ┆ 23.0 ┆ 43.286 ┆ null │\n", "│ 2019-05-07 03:00:00 ┆ 50.5 ┆ 25.0 ┆ 19.0 ┆ 35.758 ┆ 0.49505 │\n", "│ 2019-05-07 04:00:00 ┆ 45.0 ┆ 27.7 ┆ 19.0 ┆ 35.758 ┆ 0.615556 │\n", "│ 2019-05-07 05:00:00 ┆ null ┆ 50.4 ┆ 16.0 ┆ 30.112 ┆ null │\n", "│ 2019-05-07 06:00:00 ┆ null ┆ 61.9 ┆ null ┆ null ┆ null │\n", "└─────────────────────┴─────────┴─────────┴────────────────────┴─────────────────────┴─────────────────────┘" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality_renamed = air_quality_renamed.rename(str.lower)\n", "air_quality_renamed.head(5)" ] }, { "cell_type": "markdown", "id": "8bc5f7ee-c2be-4eb6-90d7-0a0bd6e473f5", "metadata": {}, "source": [ "## How to calculate summary statistics" ] }, { "cell_type": "markdown", "id": "4a991768-4f54-436a-8d4a-fb4dd0299844", "metadata": {}, "source": [ "### Aggregating statistics" ] }, { "cell_type": "code", "execution_count": 85, "id": "abdae0b3-a4a6-4c44-a747-abfe25ff8d6d", "metadata": {}, "outputs": [], "source": [ "titanic = pl.read_csv(\"data/titanic.csv\")" ] }, { "cell_type": "code", "execution_count": 86, "id": "54c396d9-910d-4efa-bd14-4d3e3d9600a1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(29.69911764705882, 29.69911764705882)" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(\n", " titanic['Age'].mean(),\n", " titanic.select(pl.col('Age').mean()).item() #or this\n", ")" ] }, { "cell_type": "code", "execution_count": 87, "id": "2a1ec433-5ffd-4629-956f-20471be9e431", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (1, 2)
AgeFare
f64f64
28.014.4542
\n", "shape: (1, 2)
AgeFare
f64f64
28.014.4542
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "res1 = titanic[[\"Age\", \"Fare\"]].median()\n", "res2 = titanic.select(pl.col('Age', 'Fare').median())\n", "row(res1, res2)" ] }, { "cell_type": "code", "execution_count": 88, "id": "0f8a16e3-d460-44bc-a971-e3f49c36b97a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 2)
columnValue
strf64
"Age"28.0
"Fare"14.4542
" ], "text/plain": [ "shape: (2, 2)\n", "┌────────┬─────────┐\n", "│ column ┆ Value │\n", "│ --- ┆ --- │\n", "│ str ┆ f64 │\n", "╞════════╪═════════╡\n", "│ Age ┆ 28.0 │\n", "│ Fare ┆ 14.4542 │\n", "└────────┴─────────┘" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res1.transpose(include_header=True, column_names=['Value'])" ] }, { "cell_type": "code", "execution_count": 89, "id": "dd90fb8a-d1bb-4de2-a80a-58fa146993d5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (9, 3)
statisticAgeFare
strf64f64
"count"714.0891.0
"null_count"177.00.0
"mean"29.69911832.204208
"std"14.52649749.693429
"min"0.420.0
"25%"20.07.925
"50%"28.014.4542
"75%"38.031.0
"max"80.0512.3292
" ], "text/plain": [ "shape: (9, 3)\n", "┌────────────┬───────────┬───────────┐\n", "│ statistic ┆ Age ┆ Fare │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 │\n", "╞════════════╪═══════════╪═══════════╡\n", "│ count ┆ 714.0 ┆ 891.0 │\n", "│ null_count ┆ 177.0 ┆ 0.0 │\n", "│ mean ┆ 29.699118 ┆ 32.204208 │\n", "│ std ┆ 14.526497 ┆ 49.693429 │\n", "│ min ┆ 0.42 ┆ 0.0 │\n", "│ 25% ┆ 20.0 ┆ 7.925 │\n", "│ 50% ┆ 28.0 ┆ 14.4542 │\n", "│ 75% ┆ 38.0 ┆ 31.0 │\n", "│ max ┆ 80.0 ┆ 512.3292 │\n", "└────────────┴───────────┴───────────┘" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic[[\"Age\", \"Fare\"]].describe()" ] }, { "cell_type": "code", "execution_count": 90, "id": "9be2688b-01de-4fb8-9adb-924529bbb7e0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 6)
columnminmaxmedianskewmean
strf64f64f64f64f64
"Age"0.4280.028.00.38829null
"Fare"0.0512.329214.4542null32.204208
" ], "text/plain": [ "shape: (2, 6)\n", "┌────────┬──────┬──────────┬─────────┬─────────┬───────────┐\n", "│ column ┆ min ┆ max ┆ median ┆ skew ┆ mean │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞════════╪══════╪══════════╪═════════╪═════════╪═══════════╡\n", "│ Age ┆ 0.42 ┆ 80.0 ┆ 28.0 ┆ 0.38829 ┆ null │\n", "│ Fare ┆ 0.0 ┆ 512.3292 ┆ 14.4542 ┆ null ┆ 32.204208 │\n", "└────────┴──────┴──────────┴─────────┴─────────┴───────────┘" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic.agg(\n", "# {\n", "# \"Age\": [\"min\", \"max\", \"median\", \"skew\"],\n", "# \"Fare\": [\"min\", \"max\", \"median\", \"mean\"],\n", "# }\n", "# )\n", "\n", "age, fare = [pl.col(name) for name in ['Age', 'Fare']]\n", "(\n", "titanic\n", ".select(\n", " Age=pl.struct(\n", " min=age.min(),\n", " max=age.max(),\n", " median=age.median(),\n", " skew=age.skew()\n", " ),\n", " Fare=pl.struct(\n", " min=fare.min(),\n", " max=fare.max(),\n", " median=fare.median(),\n", " mean=fare.mean()\n", " )\n", ")\n", ".transpose(include_header=True, column_names=['agg'])\n", ".unnest('agg')\n", ")" ] }, { "cell_type": "code", "execution_count": 91, "id": "eacafae8-d6d0-4a5a-9c25-499ecf01234d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 6)
columnminmaxmedianskewmean
strf64f64f64f64f64
"Age"0.4280.028.00.38829null
"Fare"0.0512.329214.4542null32.204208
" ], "text/plain": [ "shape: (2, 6)\n", "┌────────┬──────┬──────────┬─────────┬─────────┬───────────┐\n", "│ column ┆ min ┆ max ┆ median ┆ skew ┆ mean │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞════════╪══════╪══════════╪═════════╪═════════╪═══════════╡\n", "│ Age ┆ 0.42 ┆ 80.0 ┆ 28.0 ┆ 0.38829 ┆ null │\n", "│ Fare ┆ 0.0 ┆ 512.3292 ┆ 14.4542 ┆ null ┆ 32.204208 │\n", "└────────┴──────┴──────────┴─────────┴─────────┴───────────┘" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def agg(df, items):\n", " exprs = {\n", " key: pl.struct(**{agg_name: getattr(pl.col(key), agg_name)() for agg_name in agg_names})\n", " for key, agg_names in items.items()\n", " }\n", " return df.select(**exprs).transpose(include_header=True, column_names=[\"agg\"]).unnest(\"agg\")\n", "\n", "\n", "agg(\n", " titanic,\n", " {\n", " \"Age\": [\"min\", \"max\", \"median\", \"skew\"],\n", " \"Fare\": [\"min\", \"max\", \"median\", \"mean\"],\n", " },\n", ")" ] }, { "cell_type": "markdown", "id": "3dded75f-1fcc-439b-9d02-b634ece64d9d", "metadata": {}, "source": [ "### Aggregating statistics grouped by category" ] }, { "cell_type": "code", "execution_count": 92, "id": "11907b19-ef16-4bb0-9c2b-881740fe2727", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 2)
SexAge
strf64
"female"27.915709
"male"30.726645
" ], "text/plain": [ "shape: (2, 2)\n", "┌────────┬───────────┐\n", "│ Sex ┆ Age │\n", "│ --- ┆ --- │\n", "│ str ┆ f64 │\n", "╞════════╪═══════════╡\n", "│ female ┆ 27.915709 │\n", "│ male ┆ 30.726645 │\n", "└────────┴───────────┘" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic[[\"Sex\", \"Age\"]].groupby(\"Sex\").mean()\n", "titanic.group_by('Sex').agg(pl.col('Age').mean())" ] }, { "cell_type": "code", "execution_count": 93, "id": "daabd5b9-b862-45ce-838a-e98674c9e1d6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 8)
SexPassengerIdSurvivedPclassAgeSibSpParchFare
strf64f64f64f64f64f64f64
"female"431.0286620.7420382.15923627.9157090.6942680.64968244.479818
"male"454.1473140.1889082.38994830.7266450.4298090.23570225.523893
" ], "text/plain": [ "shape: (2, 8)\n", "┌────────┬─────────────┬──────────┬──────────┬───────────┬──────────┬──────────┬───────────┐\n", "│ Sex ┆ PassengerId ┆ Survived ┆ Pclass ┆ Age ┆ SibSp ┆ Parch ┆ Fare │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞════════╪═════════════╪══════════╪══════════╪═══════════╪══════════╪══════════╪═══════════╡\n", "│ female ┆ 431.028662 ┆ 0.742038 ┆ 2.159236 ┆ 27.915709 ┆ 0.694268 ┆ 0.649682 ┆ 44.479818 │\n", "│ male ┆ 454.147314 ┆ 0.188908 ┆ 2.389948 ┆ 30.726645 ┆ 0.429809 ┆ 0.235702 ┆ 25.523893 │\n", "└────────┴─────────────┴──────────┴──────────┴───────────┴──────────┴──────────┴───────────┘" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic.groupby(\"Sex\").mean(numeric_only=True)\n", "\n", "from polars import selectors as cs\n", "titanic.group_by('Sex').agg(cs.numeric().mean())" ] }, { "cell_type": "code", "execution_count": 94, "id": "37082ca7-0d09-4c14-b03e-cf39adc10eb5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (6, 3)
SexPclassFare
stri64f64
"female"1106.125798
"female"221.970121
"female"316.11881
"male"167.226127
"male"219.741782
"male"312.661633
" ], "text/plain": [ "shape: (6, 3)\n", "┌────────┬────────┬────────────┐\n", "│ Sex ┆ Pclass ┆ Fare │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i64 ┆ f64 │\n", "╞════════╪════════╪════════════╡\n", "│ female ┆ 1 ┆ 106.125798 │\n", "│ female ┆ 2 ┆ 21.970121 │\n", "│ female ┆ 3 ┆ 16.11881 │\n", "│ male ┆ 1 ┆ 67.226127 │\n", "│ male ┆ 2 ┆ 19.741782 │\n", "│ male ┆ 3 ┆ 12.661633 │\n", "└────────┴────────┴────────────┘" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic.groupby([\"Sex\", \"Pclass\"])[\"Fare\"].mean()\n", "(\n", "titanic\n", ".group_by('Sex', 'Pclass')\n", ".agg(pl.col('Fare').mean())\n", ".sort('Sex', 'Pclass')\n", ")" ] }, { "cell_type": "markdown", "id": "e9410f89-6999-4c69-bf1f-31ae1ff38c1c", "metadata": {}, "source": [ "### Count number of records by category" ] }, { "cell_type": "code", "execution_count": 95, "id": "1548938d-2353-4d32-8f84-7a686f3a84fe", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 2)
Pclasscount
i64u32
3491
1216
2184
\n", "shape: (3, 2)
Pclasscount
i64u32
3491
1216
2184
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# titanic[\"Pclass\"].value_counts()\n", "res1 = titanic['Pclass'].value_counts().sort('count', descending=True)\n", "res2 = (\n", "titanic\n", ".select(\n", " pl.col('Pclass')\n", " .value_counts()\n", " .struct.unnest()\n", ")\n", ".sort('count', descending=True)\n", ")\n", "row(res1, res2)" ] }, { "cell_type": "code", "execution_count": 96, "id": "7b8ba833-4670-4ad6-9edf-5e35d630cb16", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 2)
Pclasscount
i64u32
3491
2184
1216
" ], "text/plain": [ "shape: (3, 2)\n", "┌────────┬───────┐\n", "│ Pclass ┆ count │\n", "│ --- ┆ --- │\n", "│ i64 ┆ u32 │\n", "╞════════╪═══════╡\n", "│ 3 ┆ 491 │\n", "│ 2 ┆ 184 │\n", "│ 1 ┆ 216 │\n", "└────────┴───────┘" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic.groupby(\"Pclass\")[\"Pclass\"].count()\n", "titanic.group_by('Pclass').agg(count=pl.len())" ] }, { "cell_type": "markdown", "id": "3b61c9de-539f-4e09-a511-0d9b04a540cb", "metadata": {}, "source": [ "## How to reshape the layout of tables" ] }, { "cell_type": "code", "execution_count": 97, "id": "73868101-ac6d-4ec1-9940-587363069109", "metadata": {}, "outputs": [], "source": [ "titanic = pl.read_csv(\"data/titanic.csv\")\n", "air_quality = pl.read_csv(\"data/air_quality_long.csv\", try_parse_dates=True)" ] }, { "cell_type": "markdown", "id": "018edcca-898c-4131-84f3-3d3778194729", "metadata": {}, "source": [ "### Sort table rows" ] }, { "cell_type": "code", "execution_count": 98, "id": "e77becc9-69b3-4700-a68f-89005e02a23d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
80413"Thomas, Master Assad Alexander""male"0.4201"2625"8.5167null"C"
75612"Hamalainen, Master Viljo""male"0.6711"250649"14.5null"S"
47013"Baclini, Miss Helene Barbara""female"0.7521"2666"19.2583null"C"
64513"Baclini, Miss Eugenie""female"0.7521"2666"19.2583null"C"
7912"Caldwell, Master Alden Gates""male"0.8302"248738"29.0null"S"
" ], "text/plain": [ "shape: (5, 12)\n", "┌─────────────┬──────────┬────────┬────────────────────────────────┬───┬────────┬─────────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪════════════════════════════════╪═══╪════════╪═════════╪═══════╪══════════╡\n", "│ 804 ┆ 1 ┆ 3 ┆ Thomas, Master Assad Alexander ┆ … ┆ 2625 ┆ 8.5167 ┆ null ┆ C │\n", "│ 756 ┆ 1 ┆ 2 ┆ Hamalainen, Master Viljo ┆ … ┆ 250649 ┆ 14.5 ┆ null ┆ S │\n", "│ 470 ┆ 1 ┆ 3 ┆ Baclini, Miss Helene Barbara ┆ … ┆ 2666 ┆ 19.2583 ┆ null ┆ C │\n", "│ 645 ┆ 1 ┆ 3 ┆ Baclini, Miss Eugenie ┆ … ┆ 2666 ┆ 19.2583 ┆ null ┆ C │\n", "│ 79 ┆ 1 ┆ 2 ┆ Caldwell, Master Alden Gates ┆ … ┆ 248738 ┆ 29.0 ┆ null ┆ S │\n", "└─────────────┴──────────┴────────┴────────────────────────────────┴───┴────────┴─────────┴───────┴──────────┘" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.sort('Age', nulls_last=True).head(5)" ] }, { "cell_type": "code", "execution_count": 99, "id": "6b61723b-63cb-4a68-b6c3-eb6746af17ac", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 12)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
i64i64i64strstrf64i64i64strf64strstr
85203"Svensson, Mr. Johan""male"74.000"347060"7.775null"S"
11703"Connors, Mr. Patrick""male"70.500"370369"7.75null"Q"
28103"Duane, Mr. Frank""male"65.000"336439"7.75null"Q"
48413"Turkula, Mrs. (Hedwig)""female"63.000"4134"9.5875null"S"
32703"Nysveen, Mr. Johan Hansen""male"61.000"345364"6.2375null"S"
" ], "text/plain": [ "shape: (5, 12)\n", "┌─────────────┬──────────┬────────┬───────────────────────────┬───┬────────┬────────┬───────┬──────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare ┆ Cabin ┆ Embarked │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ str ┆ f64 ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪═══════════════════════════╪═══╪════════╪════════╪═══════╪══════════╡\n", "│ 852 ┆ 0 ┆ 3 ┆ Svensson, Mr. Johan ┆ … ┆ 347060 ┆ 7.775 ┆ null ┆ S │\n", "│ 117 ┆ 0 ┆ 3 ┆ Connors, Mr. Patrick ┆ … ┆ 370369 ┆ 7.75 ┆ null ┆ Q │\n", "│ 281 ┆ 0 ┆ 3 ┆ Duane, Mr. Frank ┆ … ┆ 336439 ┆ 7.75 ┆ null ┆ Q │\n", "│ 484 ┆ 1 ┆ 3 ┆ Turkula, Mrs. (Hedwig) ┆ … ┆ 4134 ┆ 9.5875 ┆ null ┆ S │\n", "│ 327 ┆ 0 ┆ 3 ┆ Nysveen, Mr. Johan Hansen ┆ … ┆ 345364 ┆ 6.2375 ┆ null ┆ S │\n", "└─────────────┴──────────┴────────┴───────────────────────────┴───┴────────┴────────┴───────┴──────────┘" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.sort('Pclass', 'Age', descending=True, nulls_last=True).head()" ] }, { "cell_type": "markdown", "id": "33927c23-55a1-46a6-89fa-6876748a2ef2", "metadata": {}, "source": [ "### Long to wide table format" ] }, { "cell_type": "code", "execution_count": 100, "id": "6d1de285-f3c4-4e68-8fa2-622bb521c0c2", "metadata": {}, "outputs": [], "source": [ "no2 = air_quality.filter(pl.col('parameter') == 'no2')" ] }, { "cell_type": "code", "execution_count": 101, "id": "9ddea593-81a3-4e55-9a15-fc4900f75932", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (6, 7)
locationcitycountrydate.utcparametervalueunit
strstrstrdatetime[μs, UTC]strf64str
"BETR801""Antwerpen""BE"2019-04-09 01:00:00 UTC"no2"22.5"µg/m³"
"BETR801""Antwerpen""BE"2019-04-09 02:00:00 UTC"no2"53.5"µg/m³"
"FR04014""Paris""FR"2019-04-09 01:00:00 UTC"no2"24.4"µg/m³"
"FR04014""Paris""FR"2019-04-09 02:00:00 UTC"no2"27.4"µg/m³"
"London Westminster""London""GB"2019-04-09 02:00:00 UTC"no2"67.0"µg/m³"
"London Westminster""London""GB"2019-04-09 03:00:00 UTC"no2"67.0"µg/m³"
" ], "text/plain": [ "shape: (6, 7)\n", "┌────────────────────┬───────────┬─────────┬─────────────────────────┬───────────┬───────┬───────┐\n", "│ location ┆ city ┆ country ┆ date.utc ┆ parameter ┆ value ┆ unit │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ datetime[μs, UTC] ┆ str ┆ f64 ┆ str │\n", "╞════════════════════╪═══════════╪═════════╪═════════════════════════╪═══════════╪═══════╪═══════╡\n", "│ BETR801 ┆ Antwerpen ┆ BE ┆ 2019-04-09 01:00:00 UTC ┆ no2 ┆ 22.5 ┆ µg/m³ │\n", "│ BETR801 ┆ Antwerpen ┆ BE ┆ 2019-04-09 02:00:00 UTC ┆ no2 ┆ 53.5 ┆ µg/m³ │\n", "│ FR04014 ┆ Paris ┆ FR ┆ 2019-04-09 01:00:00 UTC ┆ no2 ┆ 24.4 ┆ µg/m³ │\n", "│ FR04014 ┆ Paris ┆ FR ┆ 2019-04-09 02:00:00 UTC ┆ no2 ┆ 27.4 ┆ µg/m³ │\n", "│ London Westminster ┆ London ┆ GB ┆ 2019-04-09 02:00:00 UTC ┆ no2 ┆ 67.0 ┆ µg/m³ │\n", "│ London Westminster ┆ London ┆ GB ┆ 2019-04-09 03:00:00 UTC ┆ no2 ┆ 67.0 ┆ µg/m³ │\n", "└────────────────────┴───────────┴─────────┴─────────────────────────┴───────────┴───────┴───────┘" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no2_subset = no2.sort_index().groupby([\"location\"]).head(2)\n", "no2_subset = (\n", "no2\n", ".sort('date.utc')\n", ".group_by('location')\n", ".agg(pl.all().head(2))\n", ".explode(pl.exclude('location'))\n", ")\n", "no2_subset" ] }, { "cell_type": "code", "execution_count": 102, "id": "606d6372-a77f-4d45-982b-3667a7c1fa66", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 4)
date.utcBETR801FR04014London Westminster
datetime[μs, UTC]f64f64f64
2019-04-09 01:00:00 UTC22.524.4null
2019-04-09 02:00:00 UTC53.527.467.0
2019-04-09 03:00:00 UTCnullnull67.0
" ], "text/plain": [ "shape: (3, 4)\n", "┌─────────────────────────┬─────────┬─────────┬────────────────────┐\n", "│ date.utc ┆ BETR801 ┆ FR04014 ┆ London Westminster │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════════╪═════════╪═════════╪════════════════════╡\n", "│ 2019-04-09 01:00:00 UTC ┆ 22.5 ┆ 24.4 ┆ null │\n", "│ 2019-04-09 02:00:00 UTC ┆ 53.5 ┆ 27.4 ┆ 67.0 │\n", "│ 2019-04-09 03:00:00 UTC ┆ null ┆ null ┆ 67.0 │\n", "└─────────────────────────┴─────────┴─────────┴────────────────────┘" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "no2_subset.pivot(on=\"location\", index='date.utc', values='value')" ] }, { "cell_type": "code", "execution_count": 103, "id": "c018e6f8-c0e0-497e-a103-78daaed8f79a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 7)
citycountrydate.utclocationparametervalueunit
strstrdatetime[μs, UTC]strstrf64str
"Paris""FR"2019-06-21 00:00:00 UTC"FR04014""no2"20.0"µg/m³"
"Paris""FR"2019-06-20 23:00:00 UTC"FR04014""no2"21.8"µg/m³"
"Paris""FR"2019-06-20 22:00:00 UTC"FR04014""no2"26.5"µg/m³"
"Paris""FR"2019-06-20 21:00:00 UTC"FR04014""no2"24.9"µg/m³"
"Paris""FR"2019-06-20 20:00:00 UTC"FR04014""no2"21.4"µg/m³"
" ], "text/plain": [ "shape: (5, 7)\n", "┌───────┬─────────┬─────────────────────────┬──────────┬───────────┬───────┬───────┐\n", "│ city ┆ country ┆ date.utc ┆ location ┆ parameter ┆ value ┆ unit │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ datetime[μs, UTC] ┆ str ┆ str ┆ f64 ┆ str │\n", "╞═══════╪═════════╪═════════════════════════╪══════════╪═══════════╪═══════╪═══════╡\n", "│ Paris ┆ FR ┆ 2019-06-21 00:00:00 UTC ┆ FR04014 ┆ no2 ┆ 20.0 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 23:00:00 UTC ┆ FR04014 ┆ no2 ┆ 21.8 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 22:00:00 UTC ┆ FR04014 ┆ no2 ┆ 26.5 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 21:00:00 UTC ┆ FR04014 ┆ no2 ┆ 24.9 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 20:00:00 UTC ┆ FR04014 ┆ no2 ┆ 21.4 ┆ µg/m³ │\n", "└───────┴─────────┴─────────────────────────┴──────────┴───────────┴───────┴───────┘" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "no2.head(5)" ] }, { "cell_type": "code", "execution_count": 104, "id": "3e0c12c3-9148-4063-b961-b631ca16048f", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":NdOverlay [Variable]\n", " :Curve [date.utc] (value)" ] }, "execution_count": 104, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1847" } }, "output_type": "execute_result" } ], "source": [ "(\n", "no2\n", ".sort('date.utc')\n", ".pivot(on='location', index='date.utc', values='value')\n", ".hvplot.line(x=\"date.utc\")\n", ")" ] }, { "cell_type": "markdown", "id": "58ce5b8d-e1b7-4be6-a02a-290a49dbe55c", "metadata": {}, "source": [ "### Pivot table" ] }, { "cell_type": "code", "execution_count": 105, "id": "5f6929bc-bad7-4d6b-b9d4-423c4584ab7f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 3)
locationpm25no2
strf64f64
"BETR801"23.16949226.95092
"London Westminster"13.44356829.74005
"FR04014"null29.374284
" ], "text/plain": [ "shape: (3, 3)\n", "┌────────────────────┬───────────┬───────────┐\n", "│ location ┆ pm25 ┆ no2 │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 │\n", "╞════════════════════╪═══════════╪═══════════╡\n", "│ BETR801 ┆ 23.169492 ┆ 26.95092 │\n", "│ London Westminster ┆ 13.443568 ┆ 29.74005 │\n", "│ FR04014 ┆ null ┆ 29.374284 │\n", "└────────────────────┴───────────┴───────────┘" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality.pivot(\n", " on='parameter', index='location', values='value', aggregate_function='mean'\n", ")" ] }, { "cell_type": "code", "execution_count": 106, "id": "318834bb-3aab-45ff-8b92-284d5e40a5d4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3, 3)
locationpm25no2
strf64f64
"BETR801"23.16949226.95092
"London Westminster"13.44356829.74005
"FR04014"null29.374284
" ], "text/plain": [ "shape: (3, 3)\n", "┌────────────────────┬───────────┬───────────┐\n", "│ location ┆ pm25 ┆ no2 │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 │\n", "╞════════════════════╪═══════════╪═══════════╡\n", "│ BETR801 ┆ 23.169492 ┆ 26.95092 │\n", "│ London Westminster ┆ 13.443568 ┆ 29.74005 │\n", "│ FR04014 ┆ null ┆ 29.374284 │\n", "└────────────────────┴───────────┴───────────┘" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality.pivot(\n", " on='parameter', index='location', values='value', aggregate_function='mean'\n", ")" ] }, { "cell_type": "code", "execution_count": 107, "id": "677eb0da-2fa5-499d-b094-d00e7af07a39", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (4, 4)
locationno2pm25All
strf64f64f64
"FR04014"29.374284null29.374284
"BETR801"26.9509223.16949224.982353
"London Westminster"29.7400513.44356821.491708
"All"29.43031614.38684924.222743
" ], "text/plain": [ "shape: (4, 4)\n", "┌────────────────────┬───────────┬───────────┬───────────┐\n", "│ location ┆ no2 ┆ pm25 ┆ All │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞════════════════════╪═══════════╪═══════════╪═══════════╡\n", "│ FR04014 ┆ 29.374284 ┆ null ┆ 29.374284 │\n", "│ BETR801 ┆ 26.95092 ┆ 23.169492 ┆ 24.982353 │\n", "│ London Westminster ┆ 29.74005 ┆ 13.443568 ┆ 21.491708 │\n", "│ All ┆ 29.430316 ┆ 14.386849 ┆ 24.222743 │\n", "└────────────────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality.pivot_table(\n", "# values=\"value\",\n", "# index=\"location\",\n", "# columns=\"parameter\",\n", "# aggfunc=\"mean\",\n", "# margins=True,\n", "# )\n", "\n", "def pivot_with_margins(df, on, index, expr):\n", " all_on = pl.lit('All').alias(on)\n", " all_index = pl.lit('All').alias(index)\n", "\n", " return (\n", " pl.concat([\n", " df.group_by(index, on).agg(expr),\n", " df.group_by(index).agg(all_on, expr),\n", " df.group_by(on).agg(all_index, expr),\n", " df.select(all_on, all_index, expr),\n", " ], how='diagonal')\n", " .pivot(on=on, index=index)\n", " )\n", "\n", "pivot_with_margins(\n", " air_quality, \n", " on='parameter', \n", " index='location', \n", " expr=pl.col('value').mean()\n", ")" ] }, { "cell_type": "markdown", "id": "5a5545fa-1e3a-4175-8a65-eea44748698d", "metadata": {}, "source": [ "### Wide to long format" ] }, { "cell_type": "code", "execution_count": 109, "id": "4b63fc81-d613-48b3-b71e-4319642b93b8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 4)
date.utcFR04014BETR801London Westminster
datetime[μs, UTC]f64f64f64
2019-04-09 01:00:00 UTC24.422.5null
2019-04-09 02:00:00 UTC27.453.567.0
2019-04-09 03:00:00 UTC34.254.567.0
2019-04-09 04:00:00 UTC48.534.541.0
2019-04-09 05:00:00 UTC59.546.541.0
" ], "text/plain": [ "shape: (5, 4)\n", "┌─────────────────────────┬─────────┬─────────┬────────────────────┐\n", "│ date.utc ┆ FR04014 ┆ BETR801 ┆ London Westminster │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════════╪═════════╪═════════╪════════════════════╡\n", "│ 2019-04-09 01:00:00 UTC ┆ 24.4 ┆ 22.5 ┆ null │\n", "│ 2019-04-09 02:00:00 UTC ┆ 27.4 ┆ 53.5 ┆ 67.0 │\n", "│ 2019-04-09 03:00:00 UTC ┆ 34.2 ┆ 54.5 ┆ 67.0 │\n", "│ 2019-04-09 04:00:00 UTC ┆ 48.5 ┆ 34.5 ┆ 41.0 │\n", "│ 2019-04-09 05:00:00 UTC ┆ 59.5 ┆ 46.5 ┆ 41.0 │\n", "└─────────────────────────┴─────────┴─────────┴────────────────────┘" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "no2_pivoted = no2.pivot(on='location', index=\"date.utc\", values=\"value\").sort('date.utc')\n", "no2_pivoted.head(5)" ] }, { "cell_type": "code", "execution_count": 110, "id": "450c62fc-e43a-4289-803f-b4e1f22ddcc1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 3)
date.utclocationvalue
datetime[μs, UTC]strf64
2019-04-09 01:00:00 UTC"FR04014"24.4
2019-04-09 02:00:00 UTC"FR04014"27.4
2019-04-09 03:00:00 UTC"FR04014"34.2
2019-04-09 04:00:00 UTC"FR04014"48.5
2019-04-09 05:00:00 UTC"FR04014"59.5
" ], "text/plain": [ "shape: (5, 3)\n", "┌─────────────────────────┬──────────┬───────┐\n", "│ date.utc ┆ location ┆ value │\n", "│ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ str ┆ f64 │\n", "╞═════════════════════════╪══════════╪═══════╡\n", "│ 2019-04-09 01:00:00 UTC ┆ FR04014 ┆ 24.4 │\n", "│ 2019-04-09 02:00:00 UTC ┆ FR04014 ┆ 27.4 │\n", "│ 2019-04-09 03:00:00 UTC ┆ FR04014 ┆ 34.2 │\n", "│ 2019-04-09 04:00:00 UTC ┆ FR04014 ┆ 48.5 │\n", "│ 2019-04-09 05:00:00 UTC ┆ FR04014 ┆ 59.5 │\n", "└─────────────────────────┴──────────┴───────┘" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no_2 = no2_pivoted.melt(id_vars=\"date.utc\")\n", "no_2 = no2_pivoted.unpivot(index='date.utc', variable_name='location')\n", "no_2.head(5)" ] }, { "cell_type": "code", "execution_count": 111, "id": "6e67f44d-6264-4212-bf9f-9cd6b2a04245", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 3)
date.utcid_locationNO_2
datetime[μs, UTC]strf64
2019-04-09 01:00:00 UTC"BETR801"22.5
2019-04-09 02:00:00 UTC"BETR801"53.5
2019-04-09 03:00:00 UTC"BETR801"54.5
2019-04-09 04:00:00 UTC"BETR801"34.5
2019-04-09 05:00:00 UTC"BETR801"46.5
" ], "text/plain": [ "shape: (5, 3)\n", "┌─────────────────────────┬─────────────┬──────┐\n", "│ date.utc ┆ id_location ┆ NO_2 │\n", "│ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ str ┆ f64 │\n", "╞═════════════════════════╪═════════════╪══════╡\n", "│ 2019-04-09 01:00:00 UTC ┆ BETR801 ┆ 22.5 │\n", "│ 2019-04-09 02:00:00 UTC ┆ BETR801 ┆ 53.5 │\n", "│ 2019-04-09 03:00:00 UTC ┆ BETR801 ┆ 54.5 │\n", "│ 2019-04-09 04:00:00 UTC ┆ BETR801 ┆ 34.5 │\n", "│ 2019-04-09 05:00:00 UTC ┆ BETR801 ┆ 46.5 │\n", "└─────────────────────────┴─────────────┴──────┘" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "no_2 = no2_pivoted.unpivot(\n", " on=[\"BETR801\", \"FR04014\", \"London Westminster\"],\n", " index='date.utc',\n", " variable_name='id_location',\n", " value_name='NO_2'\n", ")\n", "no_2.head(5)" ] }, { "cell_type": "markdown", "id": "56b9b954-cc35-44fa-ae27-ce9eeab9e008", "metadata": {}, "source": [ "## How to combine data from multiple tables" ] }, { "cell_type": "code", "execution_count": 112, "id": "4b521629-c196-4ab0-b04c-9e65d42c5f53", "metadata": {}, "outputs": [], "source": [ "air_quality_no2 = pl.read_csv(\"data/air_quality_no2_long.csv\", try_parse_dates=True)\n", "air_quality_no2 = air_quality_no2.select(\"date.utc\", \"location\", \"parameter\", \"value\")\n", "\n", "air_quality_pm25 = pl.read_csv(\"data/air_quality_pm25_long.csv\", try_parse_dates=True)\n", "air_quality_pm25 = air_quality_pm25.select(\"date.utc\", \"location\", \"parameter\", \"value\")" ] }, { "cell_type": "markdown", "id": "08fe5db9-cd4d-4d2c-bf65-5751aa3ced9d", "metadata": {}, "source": [ "### Concatenating objects" ] }, { "cell_type": "code", "execution_count": 113, "id": "86548b1f-f18c-4f81-a09a-37973d54004a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3_178, 4)
date.utclocationparametervalue
datetime[μs, UTC]strstrf64
2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
" ], "text/plain": [ "shape: (3_178, 4)\n", "┌─────────────────────────┬────────────────────┬───────────┬───────┐\n", "│ date.utc ┆ location ┆ parameter ┆ value │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ str ┆ str ┆ f64 │\n", "╞═════════════════════════╪════════════════════╪═══════════╪═══════╡\n", "│ 2019-06-18 06:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 18.0 │\n", "│ 2019-06-17 08:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 6.5 │\n", "│ 2019-06-17 07:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 18.5 │\n", "│ 2019-06-17 06:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 16.0 │\n", "│ 2019-06-17 05:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 7.5 │\n", "│ … ┆ … ┆ … ┆ … │\n", "│ 2019-05-07 06:00:00 UTC ┆ London Westminster ┆ no2 ┆ 26.0 │\n", "│ 2019-05-07 04:00:00 UTC ┆ London Westminster ┆ no2 ┆ 16.0 │\n", "│ 2019-05-07 03:00:00 UTC ┆ London Westminster ┆ no2 ┆ 19.0 │\n", "│ 2019-05-07 02:00:00 UTC ┆ London Westminster ┆ no2 ┆ 19.0 │\n", "│ 2019-05-07 01:00:00 UTC ┆ London Westminster ┆ no2 ┆ 23.0 │\n", "└─────────────────────────┴────────────────────┴───────────┴───────┘" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality = pl.concat([air_quality_pm25, air_quality_no2], how='vertical')\n", "air_quality" ] }, { "cell_type": "code", "execution_count": 114, "id": "328ca826-45d6-4681-9c21-97ba99f57df8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of the ``air_quality_pm25`` table: (1110, 4)\n", "Shape of the ``air_quality_no2`` table: (2068, 4)\n", "Shape of the resulting ``air_quality`` table: (3178, 4)\n" ] } ], "source": [ "print('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25.shape)\n", "print('Shape of the ``air_quality_no2`` table: ', air_quality_no2.shape)\n", "print('Shape of the resulting ``air_quality`` table: ', air_quality.shape)" ] }, { "cell_type": "code", "execution_count": 115, "id": "5f619105-df56-49b3-b5e3-f1b60d899d08", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 4)
date.utclocationparametervalue
datetime[μs, UTC]strstrf64
2019-05-07 01:00:00 UTC"BETR801""pm25"12.5
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.0
2019-05-07 01:00:00 UTC"FR04014""no2"25.0
2019-05-07 01:00:00 UTC"BETR801""no2"50.5
2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
" ], "text/plain": [ "shape: (5, 4)\n", "┌─────────────────────────┬────────────────────┬───────────┬───────┐\n", "│ date.utc ┆ location ┆ parameter ┆ value │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ str ┆ str ┆ f64 │\n", "╞═════════════════════════╪════════════════════╪═══════════╪═══════╡\n", "│ 2019-05-07 01:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 12.5 │\n", "│ 2019-05-07 01:00:00 UTC ┆ London Westminster ┆ pm25 ┆ 8.0 │\n", "│ 2019-05-07 01:00:00 UTC ┆ FR04014 ┆ no2 ┆ 25.0 │\n", "│ 2019-05-07 01:00:00 UTC ┆ BETR801 ┆ no2 ┆ 50.5 │\n", "│ 2019-05-07 01:00:00 UTC ┆ London Westminster ┆ no2 ┆ 23.0 │\n", "└─────────────────────────┴────────────────────┴───────────┴───────┘" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality = air_quality.sort(\"date.utc\")\n", "air_quality.head()" ] }, { "cell_type": "code", "execution_count": 116, "id": "fdc2a980-7704-458a-a289-4198918dac12", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3_178, 5)
keydate.utclocationparametervalue
strdatetime[μs, UTC]strstrf64
"PM25"2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
"PM25"2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
"PM25"2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
"PM25"2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
"PM25"2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
"NO2"2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
"NO2"2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
"NO2"2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
" ], "text/plain": [ "shape: (3_178, 5)\n", "┌──────┬─────────────────────────┬────────────────────┬───────────┬───────┐\n", "│ key ┆ date.utc ┆ location ┆ parameter ┆ value │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ datetime[μs, UTC] ┆ str ┆ str ┆ f64 │\n", "╞══════╪═════════════════════════╪════════════════════╪═══════════╪═══════╡\n", "│ PM25 ┆ 2019-06-18 06:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 18.0 │\n", "│ PM25 ┆ 2019-06-17 08:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 6.5 │\n", "│ PM25 ┆ 2019-06-17 07:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 18.5 │\n", "│ PM25 ┆ 2019-06-17 06:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 16.0 │\n", "│ PM25 ┆ 2019-06-17 05:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 7.5 │\n", "│ … ┆ … ┆ … ┆ … ┆ … │\n", "│ NO2 ┆ 2019-05-07 06:00:00 UTC ┆ London Westminster ┆ no2 ┆ 26.0 │\n", "│ NO2 ┆ 2019-05-07 04:00:00 UTC ┆ London Westminster ┆ no2 ┆ 16.0 │\n", "│ NO2 ┆ 2019-05-07 03:00:00 UTC ┆ London Westminster ┆ no2 ┆ 19.0 │\n", "│ NO2 ┆ 2019-05-07 02:00:00 UTC ┆ London Westminster ┆ no2 ┆ 19.0 │\n", "│ NO2 ┆ 2019-05-07 01:00:00 UTC ┆ London Westminster ┆ no2 ┆ 23.0 │\n", "└──────┴─────────────────────────┴────────────────────┴───────────┴───────┘" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], keys=[\"PM25\", \"NO2\"])\n", "air_quality_ = pl.concat([\n", " air_quality_pm25.with_columns(pl.lit('PM25').alias('key')),\n", " air_quality_no2.with_columns(pl.lit('NO2').alias('key'))\n", "]).select('key', pl.exclude('key'))\n", "air_quality_" ] }, { "cell_type": "code", "execution_count": 117, "id": "a316cc44-caa9-4981-bd6c-6ef0ea16025f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3_178, 5)
keydate.utclocationparametervalue
strdatetime[μs, UTC]strstrf64
"PM25"2019-06-18 06:00:00 UTC"BETR801""pm25"18.0
"PM25"2019-06-17 08:00:00 UTC"BETR801""pm25"6.5
"PM25"2019-06-17 07:00:00 UTC"BETR801""pm25"18.5
"PM25"2019-06-17 06:00:00 UTC"BETR801""pm25"16.0
"PM25"2019-06-17 05:00:00 UTC"BETR801""pm25"7.5
"NO2"2019-05-07 06:00:00 UTC"London Westminster""no2"26.0
"NO2"2019-05-07 04:00:00 UTC"London Westminster""no2"16.0
"NO2"2019-05-07 03:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 02:00:00 UTC"London Westminster""no2"19.0
"NO2"2019-05-07 01:00:00 UTC"London Westminster""no2"23.0
" ], "text/plain": [ "shape: (3_178, 5)\n", "┌──────┬─────────────────────────┬────────────────────┬───────────┬───────┐\n", "│ key ┆ date.utc ┆ location ┆ parameter ┆ value │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ datetime[μs, UTC] ┆ str ┆ str ┆ f64 │\n", "╞══════╪═════════════════════════╪════════════════════╪═══════════╪═══════╡\n", "│ PM25 ┆ 2019-06-18 06:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 18.0 │\n", "│ PM25 ┆ 2019-06-17 08:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 6.5 │\n", "│ PM25 ┆ 2019-06-17 07:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 18.5 │\n", "│ PM25 ┆ 2019-06-17 06:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 16.0 │\n", "│ PM25 ┆ 2019-06-17 05:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 7.5 │\n", "│ … ┆ … ┆ … ┆ … ┆ … │\n", "│ NO2 ┆ 2019-05-07 06:00:00 UTC ┆ London Westminster ┆ no2 ┆ 26.0 │\n", "│ NO2 ┆ 2019-05-07 04:00:00 UTC ┆ London Westminster ┆ no2 ┆ 16.0 │\n", "│ NO2 ┆ 2019-05-07 03:00:00 UTC ┆ London Westminster ┆ no2 ┆ 19.0 │\n", "│ NO2 ┆ 2019-05-07 02:00:00 UTC ┆ London Westminster ┆ no2 ┆ 19.0 │\n", "│ NO2 ┆ 2019-05-07 01:00:00 UTC ┆ London Westminster ┆ no2 ┆ 23.0 │\n", "└──────┴─────────────────────────┴────────────────────┴───────────┴───────┘" ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def concat_with_keys(dfs, keys, key_column_name='key'):\n", " dfs = [df.with_columns(pl.lit(key).alias(key_column_name)) \n", " for df, key in zip(dfs, keys)]\n", " return pl.concat(dfs, how='vertical').select(key_column_name, pl.exclude(key_column_name))\n", "\n", "air_quality_ = concat_with_keys([air_quality_pm25, air_quality_no2], keys=[\"PM25\", \"NO2\"])\n", "air_quality_" ] }, { "cell_type": "markdown", "id": "45f64268-0048-4285-a9b0-89f9ab26bf6a", "metadata": {}, "source": [ "### Join tables using a common identifier" ] }, { "cell_type": "code", "execution_count": 118, "id": "24763498-6059-4d1a-9589-1bb48d461364", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 3)
locationcoordinates.latitudecoordinates.longitude
strf64f64
"BELAL01"51.236194.38522
"BELHB23"51.17034.341
"BELLD01"51.109985.00486
"BELLD02"51.120385.02155
"BELR833"51.327664.36226
" ], "text/plain": [ "shape: (5, 3)\n", "┌──────────┬──────────────────────┬───────────────────────┐\n", "│ location ┆ coordinates.latitude ┆ coordinates.longitude │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 │\n", "╞══════════╪══════════════════════╪═══════════════════════╡\n", "│ BELAL01 ┆ 51.23619 ┆ 4.38522 │\n", "│ BELHB23 ┆ 51.1703 ┆ 4.341 │\n", "│ BELLD01 ┆ 51.10998 ┆ 5.00486 │\n", "│ BELLD02 ┆ 51.12038 ┆ 5.02155 │\n", "│ BELR833 ┆ 51.32766 ┆ 4.36226 │\n", "└──────────┴──────────────────────┴───────────────────────┘" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stations_coord = pl.read_csv(\"data/air_quality_stations.csv\")\n", "stations_coord.head(5)" ] }, { "cell_type": "code", "execution_count": 119, "id": "9c2c1907-891d-4b54-8776-0d64bdbcf7b7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 6)
date.utclocationparametervaluecoordinates.latitudecoordinates.longitude
datetime[μs, UTC]strstrf64f64f64
2019-05-07 01:00:00 UTC"BETR801""pm25"12.551.209664.43182
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.051.49467-0.13193
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837242.3939
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837222.3939
2019-05-07 01:00:00 UTC"BETR801""no2"50.551.209664.43182
" ], "text/plain": [ "shape: (5, 6)\n", "┌─────────────────────────┬────────────────────┬───────────┬───────┬──────────────────────┬───────────────────────┐\n", "│ date.utc ┆ location ┆ parameter ┆ value ┆ coordinates.latitude ┆ coordinates.longitude │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════════╪════════════════════╪═══════════╪═══════╪══════════════════════╪═══════════════════════╡\n", "│ 2019-05-07 01:00:00 UTC ┆ BETR801 ┆ pm25 ┆ 12.5 ┆ 51.20966 ┆ 4.43182 │\n", "│ 2019-05-07 01:00:00 UTC ┆ London Westminster ┆ pm25 ┆ 8.0 ┆ 51.49467 ┆ -0.13193 │\n", "│ 2019-05-07 01:00:00 UTC ┆ FR04014 ┆ no2 ┆ 25.0 ┆ 48.83724 ┆ 2.3939 │\n", "│ 2019-05-07 01:00:00 UTC ┆ FR04014 ┆ no2 ┆ 25.0 ┆ 48.83722 ┆ 2.3939 │\n", "│ 2019-05-07 01:00:00 UTC ┆ BETR801 ┆ no2 ┆ 50.5 ┆ 51.20966 ┆ 4.43182 │\n", "└─────────────────────────┴────────────────────┴───────────┴───────┴──────────────────────┴───────────────────────┘" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality = air_quality.join(stations_coord, on='location', how='left')\n", "air_quality.head(5)" ] }, { "cell_type": "code", "execution_count": 120, "id": "fb3bba02-2c64-40d6-80f1-15cebe0eac25", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 3)
iddescriptionname
strstrstr
"bc""Black Carbon""BC"
"co""Carbon Monoxide""CO"
"no2""Nitrogen Dioxide""NO2"
"o3""Ozone""O3"
"pm10""Particulate matter less than 1…"PM10"
" ], "text/plain": [ "shape: (5, 3)\n", "┌──────┬─────────────────────────────────┬──────┐\n", "│ id ┆ description ┆ name │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str │\n", "╞══════╪═════════════════════════════════╪══════╡\n", "│ bc ┆ Black Carbon ┆ BC │\n", "│ co ┆ Carbon Monoxide ┆ CO │\n", "│ no2 ┆ Nitrogen Dioxide ┆ NO2 │\n", "│ o3 ┆ Ozone ┆ O3 │\n", "│ pm10 ┆ Particulate matter less than 1… ┆ PM10 │\n", "└──────┴─────────────────────────────────┴──────┘" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality_parameters = pl.read_csv(\"data/air_quality_parameters.csv\")\n", "air_quality_parameters.head(5)" ] }, { "cell_type": "code", "execution_count": 121, "id": "7f6dc60d-97a8-4f1f-ae1e-2bb6c6cdb241", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 8)
date.utclocationparametervaluecoordinates.latitudecoordinates.longitudedescriptionname
datetime[μs, UTC]strstrf64f64f64strstr
2019-05-07 01:00:00 UTC"BETR801""pm25"12.551.209664.43182"Particulate matter less than 2…"PM2.5"
2019-05-07 01:00:00 UTC"London Westminster""pm25"8.051.49467-0.13193"Particulate matter less than 2…"PM2.5"
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837242.3939"Nitrogen Dioxide""NO2"
2019-05-07 01:00:00 UTC"FR04014""no2"25.048.837222.3939"Nitrogen Dioxide""NO2"
2019-05-07 01:00:00 UTC"BETR801""no2"50.551.209664.43182"Nitrogen Dioxide""NO2"
" ], "text/plain": [ "shape: (5, 8)\n", "┌──────────────┬─────────────┬───────────┬───────┬────────────────────┬────────────────────┬───────────────────┬───────┐\n", "│ date.utc ┆ location ┆ parameter ┆ value ┆ coordinates.latitu ┆ coordinates.longit ┆ description ┆ name │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ de ┆ ude ┆ --- ┆ --- │\n", "│ datetime[μs, ┆ str ┆ str ┆ f64 ┆ --- ┆ --- ┆ str ┆ str │\n", "│ UTC] ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ ┆ │\n", "╞══════════════╪═════════════╪═══════════╪═══════╪════════════════════╪════════════════════╪═══════════════════╪═══════╡\n", "│ 2019-05-07 ┆ BETR801 ┆ pm25 ┆ 12.5 ┆ 51.20966 ┆ 4.43182 ┆ Particulate ┆ PM2.5 │\n", "│ 01:00:00 UTC ┆ ┆ ┆ ┆ ┆ ┆ matter less than ┆ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ 2… ┆ │\n", "│ 2019-05-07 ┆ London ┆ pm25 ┆ 8.0 ┆ 51.49467 ┆ -0.13193 ┆ Particulate ┆ PM2.5 │\n", "│ 01:00:00 UTC ┆ Westminster ┆ ┆ ┆ ┆ ┆ matter less than ┆ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ 2… ┆ │\n", "│ 2019-05-07 ┆ FR04014 ┆ no2 ┆ 25.0 ┆ 48.83724 ┆ 2.3939 ┆ Nitrogen Dioxide ┆ NO2 │\n", "│ 01:00:00 UTC ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ 2019-05-07 ┆ FR04014 ┆ no2 ┆ 25.0 ┆ 48.83722 ┆ 2.3939 ┆ Nitrogen Dioxide ┆ NO2 │\n", "│ 01:00:00 UTC ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ 2019-05-07 ┆ BETR801 ┆ no2 ┆ 50.5 ┆ 51.20966 ┆ 4.43182 ┆ Nitrogen Dioxide ┆ NO2 │\n", "│ 01:00:00 UTC ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "└──────────────┴─────────────┴───────────┴───────┴────────────────────┴────────────────────┴───────────────────┴───────┘" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality = pd.merge(air_quality, stations_coord, how=\"left\", on=\"location\")\n", "air_quality = air_quality.join(air_quality_parameters, left_on='parameter', right_on='id')\n", "air_quality.head()" ] }, { "cell_type": "markdown", "id": "a2b3ca0b-1389-4da8-84c5-c1f15c120888", "metadata": {}, "source": [ "## How to handle time series data with ease" ] }, { "cell_type": "code", "execution_count": 122, "id": "8f85fa7e-ada6-4faa-8f2b-20684688dfb2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 7)
citycountrydatetimelocationparametervalueunit
strstrstrstrstrf64str
"Paris""FR""2019-06-21 00:00:00+00:00""FR04014""no2"20.0"µg/m³"
"Paris""FR""2019-06-20 23:00:00+00:00""FR04014""no2"21.8"µg/m³"
"Paris""FR""2019-06-20 22:00:00+00:00""FR04014""no2"26.5"µg/m³"
"Paris""FR""2019-06-20 21:00:00+00:00""FR04014""no2"24.9"µg/m³"
"Paris""FR""2019-06-20 20:00:00+00:00""FR04014""no2"21.4"µg/m³"
" ], "text/plain": [ "shape: (5, 7)\n", "┌───────┬─────────┬───────────────────────────┬──────────┬───────────┬───────┬───────┐\n", "│ city ┆ country ┆ datetime ┆ location ┆ parameter ┆ value ┆ unit │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str ┆ str ┆ f64 ┆ str │\n", "╞═══════╪═════════╪═══════════════════════════╪══════════╪═══════════╪═══════╪═══════╡\n", "│ Paris ┆ FR ┆ 2019-06-21 00:00:00+00:00 ┆ FR04014 ┆ no2 ┆ 20.0 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 23:00:00+00:00 ┆ FR04014 ┆ no2 ┆ 21.8 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 22:00:00+00:00 ┆ FR04014 ┆ no2 ┆ 26.5 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 21:00:00+00:00 ┆ FR04014 ┆ no2 ┆ 24.9 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 20:00:00+00:00 ┆ FR04014 ┆ no2 ┆ 21.4 ┆ µg/m³ │\n", "└───────┴─────────┴───────────────────────────┴──────────┴───────────┴───────┴───────┘" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_quality = pl.read_csv(\"data/air_quality_no2_long.csv\")\n", "air_quality = air_quality.rename({\"date.utc\": \"datetime\"})\n", "air_quality.head()" ] }, { "cell_type": "code", "execution_count": 123, "id": "5935dabf-b736-43cc-bd4c-ed069fd037ba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (3,)
city
str
"Antwerpen"
"London"
"Paris"
\n", "shape: (3, 1)
city
str
"Antwerpen"
"London"
"Paris"
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "row(\n", " air_quality['city'].unique(),\n", " air_quality.select(pl.col('city').unique())\n", ")" ] }, { "cell_type": "markdown", "id": "fa3481c9-fc27-45cf-9d26-89908eac9d4c", "metadata": {}, "source": [ "### Using pandas datetime properties" ] }, { "cell_type": "code", "execution_count": 124, "id": "40f5edca-bc3e-4f55-ac86-b5fb4e0920ab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2_068,)
datetime
datetime[μs, UTC]
2019-06-21 00:00:00 UTC
2019-06-20 23:00:00 UTC
2019-06-20 22:00:00 UTC
2019-06-20 21:00:00 UTC
2019-06-20 20:00:00 UTC
2019-05-07 06:00:00 UTC
2019-05-07 04:00:00 UTC
2019-05-07 03:00:00 UTC
2019-05-07 02:00:00 UTC
2019-05-07 01:00:00 UTC
" ], "text/plain": [ "shape: (2_068,)\n", "Series: 'datetime' [datetime[μs, UTC]]\n", "[\n", "\t2019-06-21 00:00:00 UTC\n", "\t2019-06-20 23:00:00 UTC\n", "\t2019-06-20 22:00:00 UTC\n", "\t2019-06-20 21:00:00 UTC\n", "\t2019-06-20 20:00:00 UTC\n", "\t…\n", "\t2019-05-07 06:00:00 UTC\n", "\t2019-05-07 04:00:00 UTC\n", "\t2019-05-07 03:00:00 UTC\n", "\t2019-05-07 02:00:00 UTC\n", "\t2019-05-07 01:00:00 UTC\n", "]" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality[\"datetime\"] = pd.to_datetime(air_quality[\"datetime\"])\n", "air_quality = air_quality.with_columns(\n", " pl.col('datetime').str.to_datetime()\n", ")\n", "air_quality['datetime']" ] }, { "cell_type": "code", "execution_count": 125, "id": "112217a5-b2f3-49e0-894e-04a27c047b2e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(datetime.datetime(2019, 5, 7, 1, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')),\n", " datetime.datetime(2019, 6, 21, 0, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC')))" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality[\"datetime\"].min(), air_quality[\"datetime\"].max()\n", "air_quality.select(\n", " min=pl.col('datetime').min(),\n", " max=pl.col('datetime').max()\n", ").row(0)" ] }, { "cell_type": "code", "execution_count": 126, "id": "1f0003b3-12de-4aea-978e-0aac5fd712b8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "datetime.timedelta(days=44, seconds=82800)" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality[\"datetime\"].max() - air_quality[\"datetime\"].min()\n", "air_quality.select(\n", " span=pl.col('datetime').max() - pl.col('datetime').min()\n", ").item()" ] }, { "cell_type": "code", "execution_count": 127, "id": "4a55a3ff-2d69-43cd-9823-4535bc5dacf1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 7)
citycountrydatetimelocationparametervalueunit
strstrdatetime[μs, UTC]strstrf64str
"Paris""FR"2019-06-21 00:00:00 UTC"FR04014""no2"20.0"µg/m³"
"Paris""FR"2019-06-20 23:00:00 UTC"FR04014""no2"21.8"µg/m³"
"Paris""FR"2019-06-20 22:00:00 UTC"FR04014""no2"26.5"µg/m³"
"Paris""FR"2019-06-20 21:00:00 UTC"FR04014""no2"24.9"µg/m³"
"Paris""FR"2019-06-20 20:00:00 UTC"FR04014""no2"21.4"µg/m³"
" ], "text/plain": [ "shape: (5, 7)\n", "┌───────┬─────────┬─────────────────────────┬──────────┬───────────┬───────┬───────┐\n", "│ city ┆ country ┆ datetime ┆ location ┆ parameter ┆ value ┆ unit │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ datetime[μs, UTC] ┆ str ┆ str ┆ f64 ┆ str │\n", "╞═══════╪═════════╪═════════════════════════╪══════════╪═══════════╪═══════╪═══════╡\n", "│ Paris ┆ FR ┆ 2019-06-21 00:00:00 UTC ┆ FR04014 ┆ no2 ┆ 20.0 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 23:00:00 UTC ┆ FR04014 ┆ no2 ┆ 21.8 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 22:00:00 UTC ┆ FR04014 ┆ no2 ┆ 26.5 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 21:00:00 UTC ┆ FR04014 ┆ no2 ┆ 24.9 ┆ µg/m³ │\n", "│ Paris ┆ FR ┆ 2019-06-20 20:00:00 UTC ┆ FR04014 ┆ no2 ┆ 21.4 ┆ µg/m³ │\n", "└───────┴─────────┴─────────────────────────┴──────────┴───────────┴───────┴───────┘" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality[\"month\"] = air_quality[\"datetime\"].dt.month\n", "air_quality.with_columns(\n", " month=pl.col('datetime').dt.month()\n", ")\n", "air_quality.head(5)" ] }, { "cell_type": "code", "execution_count": 128, "id": "32392a50-2dc9-44ef-8434-3bb10ecb177e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (21, 3)
weekdaylocationvalue
i8strf64
1"BETR801"27.875
1"FR04014"24.85625
1"London Westminster"23.969697
2"BETR801"22.214286
2"FR04014"30.999359
6"FR04014"25.266154
6"London Westminster"24.977612
7"BETR801"21.896552
7"FR04014"23.274306
7"London Westminster"24.859155
" ], "text/plain": [ "shape: (21, 3)\n", "┌─────────┬────────────────────┬───────────┐\n", "│ weekday ┆ location ┆ value │\n", "│ --- ┆ --- ┆ --- │\n", "│ i8 ┆ str ┆ f64 │\n", "╞═════════╪════════════════════╪═══════════╡\n", "│ 1 ┆ BETR801 ┆ 27.875 │\n", "│ 1 ┆ FR04014 ┆ 24.85625 │\n", "│ 1 ┆ London Westminster ┆ 23.969697 │\n", "│ 2 ┆ BETR801 ┆ 22.214286 │\n", "│ 2 ┆ FR04014 ┆ 30.999359 │\n", "│ … ┆ … ┆ … │\n", "│ 6 ┆ FR04014 ┆ 25.266154 │\n", "│ 6 ┆ London Westminster ┆ 24.977612 │\n", "│ 7 ┆ BETR801 ┆ 21.896552 │\n", "│ 7 ┆ FR04014 ┆ 23.274306 │\n", "│ 7 ┆ London Westminster ┆ 24.859155 │\n", "└─────────┴────────────────────┴───────────┘" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# air_quality.groupby(\n", "# [air_quality[\"datetime\"].dt.weekday, \"location\"])[\"value\"].mean()\n", "(\n", "air_quality.group_by(\n", " pl.col('datetime').dt.weekday().alias('weekday'), 'location'\n", ")\n", ".agg(\n", " pl.col('value').mean()\n", ")\n", ".sort('weekday', 'location')\n", ")" ] }, { "cell_type": "code", "execution_count": 129, "id": "ede52907-1ec0-478b-b128-cd03d16c65ce", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":Bars [hour] (value)" ] }, "execution_count": 129, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p1965" } }, "output_type": "execute_result" } ], "source": [ "(\n", "air_quality\n", ".group_by(\n", " pl.col('datetime').dt.hour().alias('hour')\n", ")\n", ".agg(\n", " pl.col('value').mean()\n", ")\n", ".sort('hour')\n", ".hvplot.bar(x='hour', y='value')\n", ")" ] }, { "cell_type": "markdown", "id": "c3270ff5-6001-4874-968e-62bc544ffe14", "metadata": {}, "source": [ "### Datetime as index" ] }, { "cell_type": "code", "execution_count": 130, "id": "3438d268-ed0f-4260-90ad-3ed36c135f6d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 4)
datetimeFR04014BETR801London Westminster
datetime[μs, UTC]f64f64f64
2019-05-07 01:00:00 UTC25.050.523.0
2019-05-07 02:00:00 UTC27.745.019.0
2019-05-07 03:00:00 UTC50.4null19.0
2019-05-07 04:00:00 UTC61.9null16.0
2019-05-07 05:00:00 UTC72.4nullnull
" ], "text/plain": [ "shape: (5, 4)\n", "┌─────────────────────────┬─────────┬─────────┬────────────────────┐\n", "│ datetime ┆ FR04014 ┆ BETR801 ┆ London Westminster │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ datetime[μs, UTC] ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════════╪═════════╪═════════╪════════════════════╡\n", "│ 2019-05-07 01:00:00 UTC ┆ 25.0 ┆ 50.5 ┆ 23.0 │\n", "│ 2019-05-07 02:00:00 UTC ┆ 27.7 ┆ 45.0 ┆ 19.0 │\n", "│ 2019-05-07 03:00:00 UTC ┆ 50.4 ┆ null ┆ 19.0 │\n", "│ 2019-05-07 04:00:00 UTC ┆ 61.9 ┆ null ┆ 16.0 │\n", "│ 2019-05-07 05:00:00 UTC ┆ 72.4 ┆ null ┆ null │\n", "└─────────────────────────┴─────────┴─────────┴────────────────────┘" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "no_2 = air_quality.pivot(on='location', index='datetime', values='value').sort('datetime')\n", "no_2.head(5)" ] }, { "cell_type": "code", "execution_count": 131, "id": "5260f919-48f1-43cd-8a23-9a8b0c221703", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (1_033,)
datetime
i32
2019
2019
2019
2019
2019
2019
2019
2019
2019
2019
\n", "shape: (1_033,)
datetime
i8
2
2
2
2
2
4
4
4
4
5
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# no_2.index.year, no_2.index.weekday\n", "row(\n", " no_2['datetime'].dt.year(),\n", " no_2['datetime'].dt.weekday()\n", ")" ] }, { "cell_type": "code", "execution_count": 132, "id": "8377aa33-cbdf-4116-ac04-f2cfe44ab0ec", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":NdOverlay [Variable]\n", " :Curve [datetime] (value)" ] }, "execution_count": 132, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p2028" } }, "output_type": "execute_result" } ], "source": [ "no_2.filter(\n", " pl.col('datetime').dt.date().is_between(\n", " pl.date(2019, 5, 20), \n", " pl.date(2019, 5, 21)\n", " )\n", ").hvplot.line(x='datetime')" ] }, { "cell_type": "markdown", "id": "096a86a4-8729-42b9-9804-82acae62b2fb", "metadata": {}, "source": [ "### Resample a time series to another frequency" ] }, { "cell_type": "code", "execution_count": 133, "id": "00136eee-d7be-4c26-9c12-f441c417d195", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (2, 4)
datetimeFR04014BETR801London Westminster
datef64f64f64
2019-05-3197.074.597.0
2019-06-3084.752.552.0
" ], "text/plain": [ "shape: (2, 4)\n", "┌────────────┬─────────┬─────────┬────────────────────┐\n", "│ datetime ┆ FR04014 ┆ BETR801 ┆ London Westminster │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ date ┆ f64 ┆ f64 ┆ f64 │\n", "╞════════════╪═════════╪═════════╪════════════════════╡\n", "│ 2019-05-31 ┆ 97.0 ┆ 74.5 ┆ 97.0 │\n", "│ 2019-06-30 ┆ 84.7 ┆ 52.5 ┆ 52.0 │\n", "└────────────┴─────────┴─────────┴────────────────────┘" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# monthly_max = no_2.resample(\"ME\").max()\n", "monthly_max = (\n", "no_2.group_by(\n", " pl.col('datetime').dt.date().dt.month_end()\n", ")\n", ".agg(\n", " pl.all().max()\n", ")\n", ".sort('datetime')\n", ")\n", "monthly_max " ] }, { "cell_type": "code", "execution_count": 134, "id": "c1207c3b-1155-4d85-a6a3-e7de52444ea1", "metadata": {}, "outputs": [ { "data": {}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.holoviews_exec.v0+json": "", "text/html": [ "
\n", "
\n", "
\n", "" ], "text/plain": [ ":Overlay\n", " .NdOverlay.I :NdOverlay [Variable]\n", " :Curve [datetime] (value)\n", " .NdOverlay.II :NdOverlay [Variable]\n", " :Scatter [datetime] (value)" ] }, "execution_count": 134, "metadata": { "application/vnd.holoviews_exec.v0+json": { "id": "p2143" } }, "output_type": "execute_result" } ], "source": [ "# no_2.resample(\"D\").mean().plot(style=\"-o\", figsize=(10, 5))\n", "df = (\n", "no_2\n", ".group_by(\n", " pl.col('datetime').dt.truncate('1d')\n", ")\n", ".agg(\n", " pl.all().mean()\n", ")\n", ")\n", "df.hvplot.line(x='datetime') * df.hvplot.scatter(x='datetime')" ] }, { "cell_type": "markdown", "id": "12e43d9e-1e3c-4e95-bcc7-69da807be3d7", "metadata": {}, "source": [ "## How to manipulate textual data" ] }, { "cell_type": "code", "execution_count": 135, "id": "3527860c-d4f1-4cc5-8aaf-4d7de6673d27", "metadata": {}, "outputs": [], "source": [ "titanic = pl.read_csv(\"data/titanic.csv\")" ] }, { "cell_type": "code", "execution_count": 136, "id": "e9ea1a0e-211b-4842-864c-03215066a9c3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (891,)
Name
str
"braund, mr. owen harris"
"cumings, mrs. john bradley (fl…
"heikkinen, miss laina"
"futrelle, mrs. jacques heath (…
"allen, mr. william henry"
"montvila, rev. juozas"
"graham, miss margaret edith"
"johnston, miss catherine helen…
"behr, mr. karl howell"
"dooley, mr. patrick"
\n", "shape: (891,)
Name
str
"braund, mr. owen harris"
"cumings, mrs. john bradley (fl…
"heikkinen, miss laina"
"futrelle, mrs. jacques heath (…
"allen, mr. william henry"
"montvila, rev. juozas"
"graham, miss margaret edith"
"johnston, miss catherine helen…
"behr, mr. karl howell"
"dooley, mr. patrick"
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# titanic[\"Name\"].str.lower()\n", "row(\n", " titanic['Name'].str.to_lowercase(),\n", " titanic.select(pl.col('Name').str.to_lowercase()).to_series()\n", ")" ] }, { "cell_type": "code", "execution_count": 137, "id": "5524d1a7-14e1-4f95-885c-eeda461128a9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (891,)
Name
list[str]
["Braund", " Mr. Owen Harris"]
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"]
["Heikkinen", " Miss Laina"]
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"]
["Allen", " Mr. William Henry"]
["Montvila", " Rev. Juozas"]
["Graham", " Miss Margaret Edith"]
["Johnston", " Miss Catherine Helen "Carrie""]
["Behr", " Mr. Karl Howell"]
["Dooley", " Mr. Patrick"]
\n", "shape: (891,)
Name
list[str]
["Braund", " Mr. Owen Harris"]
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"]
["Heikkinen", " Miss Laina"]
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"]
["Allen", " Mr. William Henry"]
["Montvila", " Rev. Juozas"]
["Graham", " Miss Margaret Edith"]
["Johnston", " Miss Catherine Helen "Carrie""]
["Behr", " Mr. Karl Howell"]
["Dooley", " Mr. Patrick"]
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# titanic[\"Name\"].str.split(\",\")\n", "row(\n", " titanic['Name'].str.split(\",\"),\n", " titanic.select(pl.col('Name').str.split(\",\")).to_series()\n", ")" ] }, { "cell_type": "code", "execution_count": 138, "id": "8eb16012-40d3-480e-abac-6d670c6ba58b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (891,)
Surname
str
"Braund"
"Cumings"
"Heikkinen"
"Futrelle"
"Allen"
"Montvila"
"Graham"
"Johnston"
"Behr"
"Dooley"
" ], "text/plain": [ "shape: (891,)\n", "Series: 'Surname' [str]\n", "[\n", "\t\"Braund\"\n", "\t\"Cumings\"\n", "\t\"Heikkinen\"\n", "\t\"Futrelle\"\n", "\t\"Allen\"\n", "\t…\n", "\t\"Montvila\"\n", "\t\"Graham\"\n", "\t\"Johnston\"\n", "\t\"Behr\"\n", "\t\"Dooley\"\n", "]" ] }, "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic[\"Surname\"] = titanic[\"Name\"].str.split(\",\").str.get(0)\n", "titanic = titanic.with_columns(\n", " Surname=pl.col('Name').str.split(\",\").list.get(0)\n", ")\n", "titanic['Surname']" ] }, { "cell_type": "code", "execution_count": 139, "id": "4cf1a58c-95e7-45e1-8d5d-361d40757268", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (891,)
Name
bool
false
false
false
false
false
false
false
false
false
false
\n", "shape: (891,)
Name
bool
false
false
false
false
false
false
false
false
false
false
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# titanic[\"Name\"].str.contains(\"Countess\")\n", "row(\n", " titanic[\"Name\"].str.contains(\"Countess\"),\n", " titanic.select(pl.col('Name').str.contains(\"Countess\")).to_series()\n", ")" ] }, { "cell_type": "code", "execution_count": 140, "id": "e705d20f-78f5-4ed0-a69d-fcec6d2cf185", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (1, 13)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSurname
i64i64i64strstrf64i64i64strf64strstrstr
76011"Rothes, the Countess. of (Lucy…"female"33.000"110152"86.5"B77""S""Rothes"
" ], "text/plain": [ "shape: (1, 13)\n", "┌─────────────┬──────────┬────────┬─────────────────────────────────┬───┬──────┬───────┬──────────┬─────────┐\n", "│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Fare ┆ Cabin ┆ Embarked ┆ Surname │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 ┆ str ┆ ┆ f64 ┆ str ┆ str ┆ str │\n", "╞═════════════╪══════════╪════════╪═════════════════════════════════╪═══╪══════╪═══════╪══════════╪═════════╡\n", "│ 760 ┆ 1 ┆ 1 ┆ Rothes, the Countess. of (Lucy… ┆ … ┆ 86.5 ┆ B77 ┆ S ┆ Rothes │\n", "└─────────────┴──────────┴────────┴─────────────────────────────────┴───┴──────┴───────┴──────────┴─────────┘" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic[titanic[\"Name\"].str.contains(\"Countess\")]\n", "titanic.filter(\n", " pl.col('Name').str.contains(\"Countess\")\n", ")" ] }, { "cell_type": "code", "execution_count": 141, "id": "65196563-1f24-410e-ae14-4e69edcf211f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (891,)
Name
u32
23
51
21
44
24
21
27
39
21
19
\n", "shape: (891,)
Name
u32
23
51
21
44
24
21
27
39
21
19
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# titanic[\"Name\"].str.len()\n", "row(\n", " titanic[\"Name\"].str.len_chars(),\n", " titanic.select(pl.col('Name').str.len_chars()).to_series()\n", ")" ] }, { "cell_type": "code", "execution_count": 142, "id": "cd737068-09e4-4d1c-9c07-4b6548a40c2d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "307" ] }, "execution_count": 142, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic[\"Name\"].str.len().idxmax()\n", "titanic.select(pl.col('Name').str.len_chars().arg_max()).item()" ] }, { "cell_type": "code", "execution_count": 143, "id": "8aad376d-3aaa-421d-9a40-d009d119a580", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'" ] }, "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic.loc[titanic[\"Name\"].str.len().idxmax(), \"Name\"]\n", "titanic.row(\n", " titanic.select(pl.col('Name').str.len_chars().arg_max()).item(), named=True\n", ")['Name']" ] }, { "cell_type": "code", "execution_count": 144, "id": "367bcc8e-add1-4b81-8630-ae18245d7813", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic.select(\n", " pl.col('Name').get(pl.col('Name').str.len_chars().arg_max())\n", ").item()" ] }, { "cell_type": "code", "execution_count": 145, "id": "2442d428-cdb8-4c96-a943-59f85b8a3718", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (891,)
Sex_short
str
"M"
"F"
"F"
"F"
"M"
"M"
"F"
"F"
"M"
"M"
" ], "text/plain": [ "shape: (891,)\n", "Series: 'Sex_short' [str]\n", "[\n", "\t\"M\"\n", "\t\"F\"\n", "\t\"F\"\n", "\t\"F\"\n", "\t\"M\"\n", "\t…\n", "\t\"M\"\n", "\t\"F\"\n", "\t\"F\"\n", "\t\"M\"\n", "\t\"M\"\n", "]" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# titanic[\"Sex_short\"] = titanic[\"Sex\"].replace({\"male\": \"M\", \"female\": \"F\"})\n", "titanic = titanic.with_columns(\n", " Sex_short=pl.col('Sex').replace({\"male\": \"M\", \"female\": \"F\"})\n", ")\n", "titanic['Sex_short']" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }