データサイエンス100本ノック 31~40

データサイエンス100本ノック 31~40

import polars as pl
from helper.polars import load_100knocks_data
# Widen the printed width of string values to 100 characters so long text
# is not truncated in the displayed tables.
# NOTE(review): meaning taken from the polars Config API name — confirm
# against the installed polars version.
pl.Config.set_fmt_str_lengths(100)
# Load the exercise datasets in one call and unpack them into six frames.
# NOTE(review): the unpacking order is assumed to match what
# helper.polars.load_100knocks_data returns — verify against the helper.
df_customer, df_category, df_product, df_receipt, df_store, df_geocode = load_100knocks_data()

P-031

レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の標準偏差を計算し、降順で5件表示せよ。

# P-031: per-store standard deviation of `amount` (ddof=0),
# largest first, limited to the top five rows.
(
    df_receipt
    .group_by('store_cd')
    .agg(pl.col('amount').std(ddof=0).alias('std_amount'))
    .sort('std_amount', descending=True)
    .head(5)
)

shape: (5, 2)

store_cd	std_amount
str	f64
"S13052"	663.391816
"S14011"	553.456916
"S14034"	544.903736
"S13001"	543.536561
"S13015"	543.409938

P-032

レシート明細データ（df_receipt）の売上金額（amount）について、25％刻みでパーセンタイル値を求めよ。

# P-032: quantiles of `amount` at 25% steps (25, 50, 75, 100).
# Each output column is named amount_<pct>per; keyword arguments keep the
# columns in the same order as the percentages are listed.
df_receipt.select(
    **{
        f'amount_{pct}per': pl.col('amount').quantile(pct / 100)
        for pct in (25, 50, 75, 100)
    }
)

shape: (1, 4)

amount_25per	amount_50per	amount_75per	amount_100per
f64	f64	f64	f64
102.0	170.0	288.0	10925.0

P-033

レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の平均を計算し、330以上のものを抽出せよ。

# P-033: mean `amount` per store, keeping only stores whose mean is >= 330.
# maintain_order=True keeps groups in first-appearance order.
(
    df_receipt
    .group_by('store_cd', maintain_order=True)
    .agg(pl.col('amount').mean().alias('avg_amount'))
    .filter(pl.col('avg_amount') >= 330)
)

shape: (13, 2)

store_cd	avg_amount
str	f64
"S13003"	350.915519
"S13020"	337.879932
"S14026"	332.340588
"S13015"	351.11196
"S14045"	330.082073
…	…
"S14010"	348.791262
"S12013"	330.19413
"S13001"	348.470386
"S14047"	330.077073
"S13052"	402.86747

P-034

レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに売上金額（amount）を合計して全顧客の平均を求めよ。ただし、顧客IDが"Z"から始まるものは非会員を表すため、除外して計算すること。

# P-034: total `amount` per customer, then the mean of those totals.
# Rows whose customer_id starts with "Z" are dropped before aggregating.
(
    df_receipt
    .filter(~pl.col('customer_id').str.starts_with('Z'))
    .group_by('customer_id')
    .agg(pl.col('amount').sum())
    .select(pl.col('amount').mean())
    .item()  # unwrap the 1x1 frame into a plain scalar
)

2547.742234529256

P-035

レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに売上金額（amount）を合計して全顧客の平均を求め、平均以上に買い物をしている顧客を抽出し、10件表示せよ。ただし、顧客IDが"Z"から始まるものは非会員を表すため、除外して計算すること。

# P-035: customers whose total `amount` is at or above the mean total
# across all customers, showing 10 rows.
# Rows whose customer_id starts with "Z" are dropped before aggregating.
(
    df_receipt
    .lazy()
    .filter(~pl.col('customer_id').str.starts_with('Z'))
    .group_by('customer_id')
    .agg(sum_amount=pl.col('amount').sum())
    # The mean is taken over the per-customer totals produced just above.
    .filter(pl.col('sum_amount') >= pl.col('sum_amount').mean())
    # Limit inside the lazy plan so only 10 rows are materialized by
    # collect(), matching the head-then-collect order used in P-038.
    .head(10)
    .collect()
)

shape: (10, 2)

customer_id	sum_amount
str	i64
"CS003515000047"	4424
"CS006515000083"	7115
"CS019415000147"	12764
"CS034214000029"	4570
"CS003505000023"	3778
"CS010415000134"	3442
"CS021514000045"	9741
"CS006414000037"	6416
"CS045615000002"	2888
"CS022515000115"	7146

P-036

レシート明細データ（df_receipt）と店舗データ（df_store）を内部結合し、レシート明細データの全項目と店舗データの店舗名（store_name）を10件表示せよ。

# P-036: attach store_name to every receipt row via an inner join on
# store_cd. Only the key and the name are taken from df_store, so no
# other store columns leak into the result.
(
    df_receipt
    .join(
        df_store.select(['store_cd', 'store_name']),
        on='store_cd',
        how='inner',
    )
    .head(10)
)

shape: (10, 10)

sales_ymd	sales_epoch	store_cd	receipt_no	receipt_sub_no	customer_id	product_cd	quantity	amount	store_name
i64	i64	str	i64	i64	str	str	i64	i64	str
20181103	1541203200	"S14006"	112	1	"CS006214000001"	"P070305012"	1	158	"葛が谷店"
20181118	1542499200	"S13008"	1132	2	"CS008415000097"	"P070701017"	1	81	"成城店"
20170712	1499817600	"S14028"	1102	1	"CS028414000014"	"P060101005"	1	170	"二ツ橋店"
20190205	1549324800	"S14042"	1132	1	"ZZ000000000000"	"P050301001"	1	25	"新山下店"
20180821	1534809600	"S14025"	1102	2	"CS025415000050"	"P060102007"	1	90	"大和店"
20190605	1559692800	"S13003"	1112	1	"CS003515000195"	"P050102002"	1	138	"狛江店"
20181205	1543968000	"S14024"	1102	2	"CS024514000042"	"P080101005"	1	30	"三田店"
20190922	1569110400	"S14040"	1102	1	"CS040415000178"	"P070501004"	1	128	"長津田店"
20170504	1493856000	"S13020"	1112	2	"ZZ000000000000"	"P071302010"	1	770	"十条仲原店"
20191010	1570665600	"S14027"	1102	1	"CS027514000015"	"P071101003"	1	680	"南藤沢店"

P-037

商品データ（df_product）とカテゴリデータ（df_category）を内部結合し、商品データの全項目とカテゴリデータのカテゴリ小区分名（category_small_name）を10件表示せよ。

# P-037: attach category_small_name to every product row via an inner join
# on category_small_cd. Only the key and the name are taken from
# df_category.
(
    df_product
    .join(
        df_category.select(['category_small_cd', 'category_small_name']),
        on='category_small_cd',
        how='inner',
    )
    .head(10)
)

shape: (10, 7)

product_cd	category_major_cd	category_medium_cd	category_small_cd	unit_price	unit_cost	category_small_name
str	str	str	str	i64	i64	str
"P040101001"	"04"	"0401"	"040101"	198	149	"弁当類"
"P040101002"	"04"	"0401"	"040101"	218	164	"弁当類"
"P040101003"	"04"	"0401"	"040101"	230	173	"弁当類"
"P040101004"	"04"	"0401"	"040101"	248	186	"弁当類"
"P040101005"	"04"	"0401"	"040101"	268	201	"弁当類"
"P040101006"	"04"	"0401"	"040101"	298	224	"弁当類"
"P040101007"	"04"	"0401"	"040101"	338	254	"弁当類"
"P040101008"	"04"	"0401"	"040101"	420	315	"弁当類"
"P040101009"	"04"	"0401"	"040101"	498	374	"弁当類"
"P040101010"	"04"	"0401"	"040101"	580	435	"弁当類"

P-038

顧客データ（df_customer）とレシート明細データ（df_receipt）から、顧客ごとの売上金額合計を求め、10件表示せよ。ただし、売上実績がない顧客については売上金額を0として表示させること。また、顧客は性別コード（gender_cd）が女性（1）であるものを対象とし、非会員（顧客IDが"Z"から始まるもの）は除外すること。

# P-038: total `amount` per customer for customers with gender_cd "1",
# excluding IDs that start with "Z"; showing 10 rows.
(
    df_customer
    .lazy()
    .filter(
        (pl.col('gender_cd') == "1")
        & ~pl.col('customer_id').str.starts_with('Z')
    )
    .select('customer_id')
    # Left join keeps customers that have no receipts at all.
    .join(
        df_receipt
        .lazy()
        .group_by('customer_id')
        .agg(pl.col('amount').sum().alias('sum_amount')),
        on='customer_id',
        how='left',
    )
    # Customers without receipts come out of the join as null; show 0.
    .with_columns(pl.col('sum_amount').fill_null(0))
    .head(10)
    .collect()
)

shape: (10, 2)

customer_id	sum_amount
str	i64
"CS021313000114"	0
"CS031415000172"	5088
"CS028811000001"	0
"CS001215000145"	875
"CS015414000103"	3122
"CS033513000180"	868
"CS035614000014"	0
"CS011215000048"	3444
"CS009413000079"	0
"CS040412000191"	210

P-039

レシート明細データ（df_receipt）から、売上日数の多い顧客の上位20件を抽出したデータと、売上金額合計の多い顧客の上位20件を抽出したデータをそれぞれ作成し、さらにその2つを完全外部結合せよ。ただし、非会員（顧客IDが"Z"から始まるもの）は除外すること。

# P-039: full outer join of two top-20 customer rankings.
# Receipts whose customer_id starts with "Z" are excluded first.
df_data = df_receipt.filter(~pl.col('customer_id').str.starts_with('Z'))

# One grouping object shared by both aggregations below.
group = df_data.group_by('customer_id')

# Top 20 customers by number of distinct sales dates.
df_cnt = group.agg(
    pl.col('sales_ymd').n_unique().alias('come_days')
).top_k(20, by='come_days')

# Top 20 customers by total purchase amount.
df_sum = group.agg(
    pl.col('amount').sum().alias('buy_amount')
).top_k(20, by='buy_amount')

# coalesce=True merges the two customer_id key columns into one; a customer
# present in only one ranking gets null in the other ranking's column.
df_cnt.join(df_sum, how='full', on='customer_id', coalesce=True)

shape: (34, 3)

customer_id	come_days	buy_amount
str	u32	i64
"CS017415000097"	20	23086
"CS015415000185"	22	20153
"CS031414000051"	19	19202
"CS028415000007"	21	19127
"CS001605000009"	null	18925
…	…	…
"CS014214000023"	19	null
"CS022515000028"	18	null
"CS010214000002"	21	null
"CS039414000052"	19	null
"CS030214000008"	18	null

P-040

全ての店舗と全ての商品を組み合わせたデータを作成したい。店舗データ（df_store）と商品データ（df_product）を直積し、件数を計算せよ。

# P-040: Cartesian product of stores and products, reported as a row count.
(
    df_store
    .join(df_product, how='cross')
    .height  # number of rows in the crossed frame
)