データサイエンス100本ノック 21~30

データサイエンス100本ノック 21~30#

import polars as pl
from helper.polars import load_100knocks_data
pl.Config.set_fmt_str_lengths(100)
df_customer, df_category, df_product, df_receipt, df_store, df_geocode = load_100knocks_data()

P-021#

レシート明細データ(df_receipt)に対し、件数をカウントせよ。

df_receipt.shape[0]
104681
len(df_receipt)
104681

P-022#

レシート明細データ(df_receipt)の顧客ID(customer_id)に対し、ユニーク件数をカウントせよ。

(
df_receipt
.select(pl.col.customer_id.n_unique())
.item()
)
8307

P-023#

レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)と売上数量(quantity)を合計せよ。

(
df_receipt
.group_by('store_cd')
.agg(pl.col('amount', 'quantity').sum())
.sort(by='store_cd')
.head()
)
shape: (5, 3)
store_cdamountquantity
stri64i64
"S12007"6387612099
"S12013"7875132425
"S12014"7251672358
"S12029"7947412555
"S12030"6844022403

P-024#

レシート明細データ(df_receipt)に対し、顧客ID(customer_id)ごとに最も新しい売上年月日(sales_ymd)を求め、10件表示せよ。

(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(pl.col('sales_ymd').max())
.head()
)
shape: (5, 2)
customer_idsales_ymd
stri64
"CS006214000001"20190908
"CS008415000097"20190417
"CS028414000014"20191023
"ZZ000000000000"20191031
"CS025415000050"20191008
(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(
    pl.all()
        .sort_by('sales_ymd', descending=True)
        .first()
)
.head()
)
shape: (5, 9)
customer_idsales_ymdsales_epochstore_cdreceipt_noreceipt_sub_noproduct_cdquantityamount
stri64i64stri64i64stri64i64
"CS006214000001"201909081567900800"S14006"1121"P071302010"1770
"CS008415000097"201904171555459200"S13008"11921"P090405001"1200
"CS028414000014"201910231571788800"S14028"11222"P070301019"1225
"ZZ000000000000"201910311572480000"S13019"1121"P080102015"130
"CS025415000050"201910081570492800"S14025"11822"P080801009"1258
(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(
    pl.all()
    .get(pl.col.sales_ymd.arg_max())
)
.head()
)
shape: (5, 9)
customer_idsales_ymdsales_epochstore_cdreceipt_noreceipt_sub_noproduct_cdquantityamount
stri64i64stri64i64stri64i64
"CS006214000001"201909081567900800"S14006"1121"P071302010"1770
"CS008415000097"201904171555459200"S13008"11921"P090405001"1200
"CS028414000014"201910231571788800"S14028"11222"P070301019"1225
"ZZ000000000000"201910311572480000"S13019"1121"P080102015"130
"CS025415000050"201910081570492800"S14025"11822"P080801009"1258

P-025#

レシート明細データ(df_receipt)に対し、顧客ID(customer_id)ごとに最も古い売上年月日(sales_ymd)を求め、10件表示せよ。

(
df_receipt
.group_by(pl.col.customer_id, maintain_order=True)
.agg(pl.col('sales_ymd').min())
.head()
)
shape: (5, 2)
customer_idsales_ymd
stri64
"CS006214000001"20170509
"CS008415000097"20170328
"CS028414000014"20170403
"ZZ000000000000"20170101
"CS025415000050"20170619

P-026#

レシート明細データ(df_receipt)に対し、顧客ID(customer_id)ごとに最も新しい売上年月日(sales_ymd)と古い売上年月日を求め、両者が異なるデータを10件表示せよ。

(
df_receipt
.lazy()
.group_by('customer_id', maintain_order=True)
.agg(
    pl.col.sales_ymd.min().alias('sales_ymd_min'),
    pl.col.sales_ymd.max().alias('sales_ymd_max'),    
)
.filter(
    pl.col.sales_ymd_min != pl.col.sales_ymd_max
)
.head(5)
.collect()
)
shape: (5, 3)
customer_idsales_ymd_minsales_ymd_max
stri64i64
"CS006214000001"2017050920190908
"CS008415000097"2017032820190417
"CS028414000014"2017040320191023
"ZZ000000000000"2017010120191031
"CS025415000050"2017061920191008

P-027#

レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)の平均を計算し、降順でTOP5を表示せよ。

(
df_receipt
.group_by('store_cd')
.agg(pl.col.amount.mean())
.sort(by='amount', descending=True)
.head(5)
)
shape: (5, 2)
store_cdamount
strf64
"S13052"402.86747
"S13015"351.11196
"S13003"350.915519
"S14010"348.791262
"S13001"348.470386

P-028#

レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)の中央値を計算し、降順でTOP5を表示せよ。

(
df_receipt
.group_by('store_cd')
.agg(pl.col.amount.median())
.sort(by='amount', descending=True)
.head(5)
)
shape: (5, 2)
store_cdamount
strf64
"S13052"190.0
"S14010"188.0
"S14050"185.0
"S13018"180.0
"S13003"180.0

P-029#

レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに商品コード(product_cd)の最頻値を求め、10件表示させよ。

mode = pl.col.product_cd.mode()
(
df_receipt
.group_by('store_cd')
.agg(
    mode,
    (pl.col.product_cd == mode.first())
        .sum()
        .alias('count')
)
.sort(by='store_cd')
.head(10)
)
shape: (10, 3)
store_cdproduct_cdcount
strlist[str]u32
"S12007"["P060303001"]72
"S12013"["P060303001"]107
"S12014"["P060303001"]65
"S12029"["P060303001"]92
"S12030"["P060303001"]115
"S13001"["P060303001"]67
"S13002"["P060303001"]78
"S13003"["P071401001"]65
"S13004"["P060303001"]88
"S13005"["P040503001"]36
(
df_receipt
.group_by('store_cd', 'product_cd')
.agg(pl.count())
.filter(
    (pl.col.count == pl.col.count.max()).over('store_cd')
)
.sort(by='store_cd')
.head(10)
)
C:\Users\ruoyu\AppData\Local\Temp\ipykernel_25276\4003027574.py:4: DeprecationWarning: `pl.count()` is deprecated. Please use `pl.len()` instead.
  .agg(pl.count())
shape: (10, 3)
store_cdproduct_cdcount
strstru32
"S12007""P060303001"72
"S12013""P060303001"107
"S12014""P060303001"65
"S12029""P060303001"92
"S12030""P060303001"115
"S13001""P060303001"67
"S13002""P060303001"78
"S13003""P071401001"65
"S13004""P060303001"88
"S13005""P040503001"36

P-030#

レシート明細データ(df_receipt)に対し、店舗コード(store_cd)ごとに売上金額(amount)の分散を計算し、降順で5件表示せよ。

(
df_receipt
.group_by('store_cd')
.agg(
    vars_amount=pl.col.amount.var(ddof=0)
)
.sort(by='vars_amount', descending=True)
.head()
)
shape: (5, 2)
store_cdvars_amount
strf64
"S13052"440088.701311
"S14011"306314.558164
"S14034"296920.081011
"S13001"295431.993329
"S13015"295294.361116