データサイエンス100本ノック 21~30#
import polars as pl
from helper.polars import load_100knocks_data
pl.Config.set_fmt_str_lengths(100)
df_customer, df_category, df_product, df_receipt, df_store, df_geocode = load_100knocks_data()
P-021#
レシート明細データ(df_receipt)に対し、件数をカウントせよ。
df_receipt.shape[0]
104681
len(df_receipt)
104681
P-022#
レシート明細データ(df_receipt)の顧客ID(customer_id)に対し、ユニーク件数をカウントせよ。
(
df_receipt
.select(pl.col.customer_id.n_unique())
.item()
)
8307
P-023#
レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)と売上数量(quantity)を合計せよ。
(
df_receipt
.group_by('store_cd')
.agg(pl.col('amount', 'quantity').sum())
.sort(by='store_cd')
.head()
)
shape: (5, 3)
store_cd | amount | quantity |
---|---|---|
str | i64 | i64 |
"S12007" | 638761 | 2099 |
"S12013" | 787513 | 2425 |
"S12014" | 725167 | 2358 |
"S12029" | 794741 | 2555 |
"S12030" | 684402 | 2403 |
P-024#
レシート明細データ(df_receipt)に対し、顧客ID(customer_id)ごとに最も新しい売上年月日(sales_ymd)を求め、10件表示せよ。
(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(pl.col('sales_ymd').max())
.head()
)
shape: (5, 2)
customer_id | sales_ymd |
---|---|
str | i64 |
"CS006214000001" | 20190908 |
"CS008415000097" | 20190417 |
"CS028414000014" | 20191023 |
"ZZ000000000000" | 20191031 |
"CS025415000050" | 20191008 |
(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(
pl.all()
.sort_by('sales_ymd', descending=True)
.first()
)
.head()
)
shape: (5, 9)
customer_id | sales_ymd | sales_epoch | store_cd | receipt_no | receipt_sub_no | product_cd | quantity | amount |
---|---|---|---|---|---|---|---|---|
str | i64 | i64 | str | i64 | i64 | str | i64 | i64 |
"CS006214000001" | 20190908 | 1567900800 | "S14006" | 112 | 1 | "P071302010" | 1 | 770 |
"CS008415000097" | 20190417 | 1555459200 | "S13008" | 1192 | 1 | "P090405001" | 1 | 200 |
"CS028414000014" | 20191023 | 1571788800 | "S14028" | 1122 | 2 | "P070301019" | 1 | 225 |
"ZZ000000000000" | 20191031 | 1572480000 | "S13019" | 112 | 1 | "P080102015" | 1 | 30 |
"CS025415000050" | 20191008 | 1570492800 | "S14025" | 1182 | 2 | "P080801009" | 1 | 258 |
(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(
pl.all()
.get(pl.col.sales_ymd.arg_max())
)
.head()
)
shape: (5, 9)
customer_id | sales_ymd | sales_epoch | store_cd | receipt_no | receipt_sub_no | product_cd | quantity | amount |
---|---|---|---|---|---|---|---|---|
str | i64 | i64 | str | i64 | i64 | str | i64 | i64 |
"CS006214000001" | 20190908 | 1567900800 | "S14006" | 112 | 1 | "P071302010" | 1 | 770 |
"CS008415000097" | 20190417 | 1555459200 | "S13008" | 1192 | 1 | "P090405001" | 1 | 200 |
"CS028414000014" | 20191023 | 1571788800 | "S14028" | 1122 | 2 | "P070301019" | 1 | 225 |
"ZZ000000000000" | 20191031 | 1572480000 | "S13019" | 112 | 1 | "P080102015" | 1 | 30 |
"CS025415000050" | 20191008 | 1570492800 | "S14025" | 1182 | 2 | "P080801009" | 1 | 258 |
P-025#
レシート明細データ(df_receipt)に対し、顧客ID(customer_id)ごとに最も古い売上年月日(sales_ymd)を求め、10件表示せよ。
(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(pl.col('sales_ymd').min())
.head()
)
shape: (5, 2)
customer_id | sales_ymd |
---|---|
str | i64 |
"CS006214000001" | 20170509 |
"CS008415000097" | 20170328 |
"CS028414000014" | 20170403 |
"ZZ000000000000" | 20170101 |
"CS025415000050" | 20170619 |
P-026#
レシート明細データ(df_receipt)に対し、顧客ID(customer_id)ごとに最も新しい売上年月日(sales_ymd)と古い売上年月日を求め、両者が異なるデータを10件表示せよ。
(
df_receipt
.lazy()
.group_by('customer_id', maintain_order=True)
.agg(
pl.col.sales_ymd.min().alias('sales_ymd_min'),
pl.col.sales_ymd.max().alias('sales_ymd_max'),
)
.filter(
pl.col.sales_ymd_min != pl.col.sales_ymd_max
)
.head(5)
.collect()
)
shape: (5, 3)
customer_id | sales_ymd_min | sales_ymd_max |
---|---|---|
str | i64 | i64 |
"CS006214000001" | 20170509 | 20190908 |
"CS008415000097" | 20170328 | 20190417 |
"CS028414000014" | 20170403 | 20191023 |
"ZZ000000000000" | 20170101 | 20191031 |
"CS025415000050" | 20170619 | 20191008 |
P-027#
レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)の平均を計算し、降順でTOP5を表示せよ。
(
df_receipt
.group_by('store_cd')
.agg(pl.col.amount.mean())
.sort(by='amount', descending=True)
.head(5)
)
shape: (5, 2)
store_cd | amount |
---|---|
str | f64 |
"S13052" | 402.86747 |
"S13015" | 351.11196 |
"S13003" | 350.915519 |
"S14010" | 348.791262 |
"S13001" | 348.470386 |
P-028#
レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)の中央値を計算し、降順でTOP5を表示せよ。
(
df_receipt
.group_by('store_cd')
.agg(pl.col.amount.median())
.sort(by='amount', descending=True)
.head(5)
)
shape: (5, 2)
store_cd | amount |
---|---|
str | f64 |
"S13052" | 190.0 |
"S14010" | 188.0 |
"S14050" | 185.0 |
"S13018" | 180.0 |
"S13003" | 180.0 |
P-029#
レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに商品コード(product_cd)の最頻値を求め、10件表示させよ。
mode = pl.col.product_cd.mode()
(
df_receipt
.group_by('store_cd')
.agg(
mode,
(pl.col.product_cd == mode.first())
.sum()
.alias('count')
)
.sort(by='store_cd')
.head(10)
)
shape: (10, 3)
store_cd | product_cd | count |
---|---|---|
str | list[str] | u32 |
"S12007" | ["P060303001"] | 72 |
"S12013" | ["P060303001"] | 107 |
"S12014" | ["P060303001"] | 65 |
"S12029" | ["P060303001"] | 92 |
"S12030" | ["P060303001"] | 115 |
"S13001" | ["P060303001"] | 67 |
"S13002" | ["P060303001"] | 78 |
"S13003" | ["P071401001"] | 65 |
"S13004" | ["P060303001"] | 88 |
"S13005" | ["P040503001"] | 36 |
(
df_receipt
.group_by('store_cd', 'product_cd')
.agg(pl.count())
.filter(
(pl.col.count == pl.col.count.max()).over('store_cd')
)
.sort(by='store_cd')
.head(10)
)
C:\Users\ruoyu\AppData\Local\Temp\ipykernel_25276\4003027574.py:4: DeprecationWarning: `pl.count()` is deprecated. Please use `pl.len()` instead.
.agg(pl.count())
shape: (10, 3)
store_cd | product_cd | count |
---|---|---|
str | str | u32 |
"S12007" | "P060303001" | 72 |
"S12013" | "P060303001" | 107 |
"S12014" | "P060303001" | 65 |
"S12029" | "P060303001" | 92 |
"S12030" | "P060303001" | 115 |
"S13001" | "P060303001" | 67 |
"S13002" | "P060303001" | 78 |
"S13003" | "P071401001" | 65 |
"S13004" | "P060303001" | 88 |
"S13005" | "P040503001" | 36 |
P-030#
レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)の分散を計算し、降順で5件表示せよ。
(
df_receipt
.group_by('store_cd')
.agg(
vars_amount=pl.col.amount.var(ddof=0)
)
.sort(by='vars_amount', descending=True)
.head()
)
shape: (5, 2)
store_cd | vars_amount |
---|---|
str | f64 |
"S13052" | 440088.701311 |
"S14011" | 306314.558164 |
"S14034" | 296920.081011 |
"S13001" | 295431.993329 |
"S13015" | 295294.361116 |