データサイエンス100本ノック 21~30

データサイエンス100本ノック 21~30#

import polars as pl
from helper.polars import load_100knocks_data
pl.Config.set_fmt_str_lengths(100)
df_customer, df_category, df_product, df_receipt, df_store, df_geocode = load_100knocks_data()

P-021#

レシート明細データ（df_receipt）に対し、件数をカウントせよ。

df_receipt.shape[0]

len(df_receipt)

P-022#

レシート明細データ（df_receipt）の顧客ID（customer_id）に対し、ユニーク件数をカウントせよ。

(
df_receipt
.select(pl.col.customer_id.n_unique())
.item()
)

P-023#

レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）と売上数量（quantity）を合計せよ。

(
df_receipt
.group_by('store_cd')
.agg(pl.col('amount', 'quantity').sum())
.sort(by='store_cd')
.head()
)

shape: (5, 3)

store_cd	amount	quantity
str	i64	i64
"S12007"	638761	2099
"S12013"	787513	2425
"S12014"	725167	2358
"S12029"	794741	2555
"S12030"	684402	2403

P-024#

レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに最も新しい売上年月日（sales_ymd）を求め、10件表示せよ。

(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(pl.col('sales_ymd').max())
.head()
)

shape: (5, 2)

customer_id	sales_ymd
str	i64
"CS006214000001"	20190908
"CS008415000097"	20190417
"CS028414000014"	20191023
"ZZ000000000000"	20191031
"CS025415000050"	20191008

(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(
    pl.all()
        .sort_by('sales_ymd', descending=True)
        .first()
)
.head()
)

shape: (5, 9)

customer_id	sales_ymd	sales_epoch	store_cd	receipt_no	receipt_sub_no	product_cd	quantity	amount
str	i64	i64	str	i64	i64	str	i64	i64
"CS006214000001"	20190908	1567900800	"S14006"	112	1	"P071302010"	1	770
"CS008415000097"	20190417	1555459200	"S13008"	1192	1	"P090405001"	1	200
"CS028414000014"	20191023	1571788800	"S14028"	1122	2	"P070301019"	1	225
"ZZ000000000000"	20191031	1572480000	"S13019"	112	1	"P080102015"	1	30
"CS025415000050"	20191008	1570492800	"S14025"	1182	2	"P080801009"	1	258

(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(
    pl.all()
    .get(pl.col.sales_ymd.arg_max())
)
.head()
)

shape: (5, 9)

customer_id	sales_ymd	sales_epoch	store_cd	receipt_no	receipt_sub_no	product_cd	quantity	amount
str	i64	i64	str	i64	i64	str	i64	i64
"CS006214000001"	20190908	1567900800	"S14006"	112	1	"P071302010"	1	770
"CS008415000097"	20190417	1555459200	"S13008"	1192	1	"P090405001"	1	200
"CS028414000014"	20191023	1571788800	"S14028"	1122	2	"P070301019"	1	225
"ZZ000000000000"	20191031	1572480000	"S13019"	112	1	"P080102015"	1	30
"CS025415000050"	20191008	1570492800	"S14025"	1182	2	"P080801009"	1	258

P-025#

レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに最も古い売上年月日（sales_ymd）を求め、10件表示せよ。

(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(pl.col('sales_ymd').min())
.head()
)

shape: (5, 2)

customer_id	sales_ymd
str	i64
"CS006214000001"	20170509
"CS008415000097"	20170328
"CS028414000014"	20170403
"ZZ000000000000"	20170101
"CS025415000050"	20170619

P-026#

レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに最も新しい売上年月日（sales_ymd）と古い売上年月日を求め、両者が異なるデータを10件表示せよ。

(
df_receipt
.lazy()
.group_by('customer_id', maintain_order=True)
.agg(
    pl.col.sales_ymd.min().alias('sales_ymd_min'),
    pl.col.sales_ymd.max().alias('sales_ymd_max'),    
)
.filter(
    pl.col.sales_ymd_min != pl.col.sales_ymd_max
)
.head(5)
.collect()
)

shape: (5, 3)

customer_id	sales_ymd_min	sales_ymd_max
str	i64	i64
"CS006214000001"	20170509	20190908
"CS008415000097"	20170328	20190417
"CS028414000014"	20170403	20191023
"ZZ000000000000"	20170101	20191031
"CS025415000050"	20170619	20191008

P-027#

レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の平均を計算し、降順でTOP5を表示せよ。

(
df_receipt
.group_by('store_cd')
.agg(pl.col.amount.mean())
.sort(by='amount', descending=True)
.head(5)
)

shape: (5, 2)

store_cd	amount
str	f64
"S13052"	402.86747
"S13015"	351.11196
"S13003"	350.915519
"S14010"	348.791262
"S13001"	348.470386

P-028#

レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の中央値を計算し、降順でTOP5を表示せよ。

(
df_receipt
.group_by('store_cd')
.agg(pl.col.amount.median())
.sort(by='amount', descending=True)
.head(5)
)

shape: (5, 2)

store_cd	amount
str	f64
"S13052"	190.0
"S14010"	188.0
"S14050"	185.0
"S13018"	180.0
"S13003"	180.0

P-029#

レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに商品コード（product_cd）の最頻値を求め、10件表示させよ。

mode = pl.col.product_cd.mode()
(
df_receipt
.group_by('store_cd')
.agg(
    mode,
    (pl.col.product_cd == mode.first())
        .sum()
        .alias('count')
)
.sort(by='store_cd')
.head(10)
)

shape: (10, 3)

store_cd	product_cd	count
str	list[str]	u32
"S12007"	["P060303001"]	72
"S12013"	["P060303001"]	107
"S12014"	["P060303001"]	65
"S12029"	["P060303001"]	92
"S12030"	["P060303001"]	115
"S13001"	["P060303001"]	67
"S13002"	["P060303001"]	78
"S13003"	["P071401001"]	65
"S13004"	["P060303001"]	88
"S13005"	["P040503001"]	36

(
df_receipt
.group_by('store_cd', 'product_cd')
.agg(pl.count())
.filter(
    (pl.col.count == pl.col.count.max()).over('store_cd')
)
.sort(by='store_cd')
.head(10)
)

C:\Users\ruoyu\AppData\Local\Temp\ipykernel_25276\4003027574.py:4: DeprecationWarning: `pl.count()` is deprecated. Please use `pl.len()` instead.
  .agg(pl.count())

shape: (10, 3)

store_cd	product_cd	count
str	str	u32
"S12007"	"P060303001"	72
"S12013"	"P060303001"	107
"S12014"	"P060303001"	65
"S12029"	"P060303001"	92
"S12030"	"P060303001"	115
"S13001"	"P060303001"	67
"S13002"	"P060303001"	78
"S13003"	"P071401001"	65
"S13004"	"P060303001"	88
"S13005"	"P040503001"	36

P-030#

レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の分散を計算し、降順で5件表示せよ。

(
df_receipt
.group_by('store_cd')
.agg(
    vars_amount=pl.col.amount.var(ddof=0)
)
.sort(by='vars_amount', descending=True)
.head()
)

shape: (5, 2)

store_cd	vars_amount
str	f64
"S13052"	440088.701311
"S14011"	306314.558164
"S14034"	296920.081011
"S13001"	295431.993329
"S13015"	295294.361116

データサイエンス100本ノック 21~30

Contents

データサイエンス100本ノック 21~30#

P-021#

P-022#

P-023#

P-024#

P-025#

P-026#

P-027#

P-028#

P-029#

P-030#