データサイエンス100本ノック 81~90

データサイエンス100本ノック 81~90#

import polars as pl
from helper.polars import load_100knocks_data
pl.Config.set_fmt_str_lengths(100)
df_customer, df_category, df_product, df_receipt, df_store, df_geocode = load_100knocks_data()

P-081#

単価（unit_price）と原価（unit_cost）の欠損値について、それぞれの平均値で補完した新たな商品データを作成せよ。なお、平均値については1円未満を丸めること（四捨五入または偶数への丸めで良い）。補完実施後、各項目について欠損が生じていないことも確認すること。

(
df_product
.with_columns(
    pl.col.unit_price.fill_null(pl.col.unit_price.mean()),
    pl.col.unit_cost.fill_null(pl.col.unit_cost.mean())
)
.select(pl.all().null_count())
)

shape: (1, 6)

product_cd	category_major_cd	category_medium_cd	category_small_cd	unit_price	unit_cost
u32	u32	u32	u32	u32	u32
0	0	0	0	0	0

P-082#

単価（unit_price）と原価（unit_cost）の欠損値について、それぞれの中央値で補完した新たな商品データを作成せよ。なお、中央値については1円未満を丸めること（四捨五入または偶数への丸めで良い）。補完実施後、各項目について欠損が生じていないことも確認すること。

(
df_product
.with_columns(
    pl.col.unit_price.fill_null(pl.col.unit_price.median()),
    pl.col.unit_cost.fill_null(pl.col.unit_cost.median())
)
.select(pl.all().null_count())
)

shape: (1, 6)

product_cd	category_major_cd	category_medium_cd	category_small_cd	unit_price	unit_cost
u32	u32	u32	u32	u32	u32
0	0	0	0	0	0

P-083#

単価（unit_price）と原価（unit_cost）の欠損値について、各商品のカテゴリ小区分コード（category_small_cd）ごとに算出した中央値で補完した新たな商品データを作成せよ。なお、中央値については1円未満を丸めること（四捨五入または偶数への丸めで良い）。補完実施後、各項目について欠損が生じていないことも確認すること。

cols = pl.col('unit_price', 'unit_cost')
(
df_product
.with_columns(
    pl.when(cols.is_null())
      .then(cols.median())
      .otherwise(cols)
      .over('category_small_cd')
)
.select(pl.all().null_count())
)

shape: (1, 6)

product_cd	category_major_cd	category_medium_cd	category_small_cd	unit_price	unit_cost
u32	u32	u32	u32	u32	u32
0	0	0	0	0	0

P-084#

顧客データ（df_customer）の全顧客に対して全期間の売上金額に占める2019年売上金額の割合を計算し、新たなデータを作成せよ。ただし、売上実績がない場合は0として扱うこと。そして計算した割合が0超のものを抽出し、結果を10件表示せよ。また、作成したデータに欠損が存在しないことを確認せよ。

(
df_customer
.join(
    df_receipt
        .group_by('customer_id')
        .agg(sales_amount_all=pl.col.amount.sum()), 
    on='customer_id', how='left'
)
.join(
    df_receipt
        .filter((pl.col.sales_ymd // 10000) == 2019)
        .group_by('customer_id')
        .agg(sales_amount_2019=pl.col.amount.sum()), 
    on='customer_id', how='left'
)
.select(
    'customer_id',
    pl.col.sales_amount_all.fill_null(0),
    pl.col.sales_amount_2019.fill_null(0)
)
.with_columns(
    sales_rate=(pl.col.sales_amount_2019 / pl.col.sales_amount_all).fill_nan(0)
)
.filter(pl.col.sales_amount_2019 > 0)
.head()
)

shape: (5, 4)

customer_id	sales_amount_all	sales_amount_2019	sales_rate
str	i64	i64	f64
"CS031415000172"	5088	2971	0.583923
"CS015414000103"	3122	874	0.279949
"CS011215000048"	3444	248	0.072009
"CS029415000023"	5167	3767	0.72905
"CS035415000029"	7504	5823	0.775986

P-085#

顧客データ（df_customer）の全顧客に対し、郵便番号（postal_cd）を用いてジオコードデータ（df_geocode）を紐付け、新たな顧客データを作成せよ。ただし、1つの郵便番号（postal_cd）に複数の経度（longitude）、緯度（latitude）情報が紐づく場合は、経度（longitude）、緯度（latitude）の平均値を算出して使用すること。また、作成結果を確認するために結果を10件表示せよ。

df_loc = (
    df_geocode
    .group_by(pl.col.postal_cd)
    .agg(
        pl.col.longitude.mean(),
        pl.col.latitude.mean()
    )
)

df_customer_loc = (
    df_customer.join(df_loc, on='postal_cd', how='left')
)

df_customer_loc.head()

shape: (5, 13)

customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd	longitude	latitude
str	str	str	str	str	i64	str	str	str	i64	str	f64	f64
"CS021313000114"	"大野あや子"	"1"	"女性"	"1981-04-29"	37	"259-1113"	"神奈川県伊勢原市粟窪**********"	"S14021"	20150905	"0-00000000-0"	139.31779	35.41358
"CS037613000071"	"六角雅彦"	"9"	"不明"	"1952-04-01"	66	"136-0076"	"東京都江東区南砂**********"	"S13037"	20150414	"0-00000000-0"	139.83502	35.67193
"CS031415000172"	"宇多田貴美子"	"1"	"女性"	"1976-10-04"	42	"151-0053"	"東京都渋谷区代々木**********"	"S13031"	20150529	"D-20100325-C"	139.68965	35.67374
"CS028811000001"	"堀井かおり"	"1"	"女性"	"1933-03-27"	86	"245-0016"	"神奈川県横浜市泉区和泉町**********"	"S14028"	20160115	"0-00000000-0"	139.4836	35.39125
"CS001215000145"	"田崎美紀"	"1"	"女性"	"1995-03-29"	24	"144-0055"	"東京都大田区仲六郷**********"	"S13001"	20170605	"6-20090929-2"	139.70775	35.54084

P-086#

085で作成した緯度経度つき顧客データに対し、会員申込店舗コード（application_store_cd）をキーに店舗データ（df_store）と結合せよ。そして申込み店舗の緯度（latitude）・経度情報（longitude)と顧客住所（address）の緯度・経度を用いて申込み店舗と顧客住所の距離（単位：km）を求め、顧客ID（customer_id）、顧客住所（address）、店舗住所（address）とともに表示せよ。計算式は以下の簡易式で良いものとするが、その他精度の高い方式を利用したライブラリを利用してもかまわない。結果は10件表示せよ。

\[\begin{split} \mbox{緯度（ラジアン）}：\phi \\ \mbox{経度（ラジアン）}：\lambda \\ \mbox{距離}L = 6371 * \arccos(\sin \phi_1 * \sin \phi_2 + \cos \phi_1 * \cos \phi_2 * \cos(\lambda_1 − \lambda_2)) \end{split}\]

p1 = pl.col.latitude.radians()
p2 = pl.col.latitude_right.radians()
l1 = pl.col.longitude.radians()
l2 = pl.col.longitude.radians()
distance = (p1.sin() * p2.sin() + p1.cos() * p2.cos() * (l1 - l1).cos()).arccos() * 6371
(
df_customer_loc
.join(
    df_store,
    left_on='application_store_cd',
    right_on='store_cd',
    how='left'
)
.with_columns(distance=distance)
.select(
    'customer_id',
    pl.col.address.alias('customer_address'),
    pl.col.address_right.alias('store_address'),
    'distance'
)
.head()
)

shape: (5, 4)

customer_id	customer_address	store_address	distance
str	str	str	f64
"CS021313000114"	"神奈川県伊勢原市粟窪**********"	"神奈川県伊勢原市伊勢原四丁目"	1.322108
"CS037613000071"	"東京都江東区南砂**********"	"東京都江東区南砂一丁目"	0.783924
"CS031415000172"	"東京都渋谷区代々木**********"	"東京都渋谷区初台二丁目"	0.269092
"CS028811000001"	"神奈川県横浜市泉区和泉町**********"	"神奈川県横浜市瀬谷区二ツ橋町"	7.982684
"CS001215000145"	"東京都大田区仲六郷**********"	"東京都大田区仲六郷二丁目"	1.168659

P-087#

顧客データ（df_customer）では、異なる店舗での申込みなどにより同一顧客が複数登録されている。名前（customer_name）と郵便番号（postal_cd）が同じ顧客は同一顧客とみなして1顧客1レコードとなるように名寄せした名寄顧客データを作成し、顧客データの件数、名寄顧客データの件数、重複数を算出せよ。ただし、同一顧客に対しては売上金額合計が最も高いものを残し、売上金額合計が同一もしくは売上実績がない顧客については顧客ID（customer_id）の番号が小さいものを残すこととする。

df_receipt_sum = (
    df_receipt
    .group_by('customer_id')
    .agg(pl.col.amount.sum())
)

df_unique_customer = (
    df_customer
    .join(df_receipt_sum, on='customer_id', how='left')
    .with_columns(pl.col.amount.fill_null(0))
    .group_by('customer_name', 'postal_cd')
    .agg(
        pl.col('customer_id', 'amount')
        .sort_by('amount', 'customer_id', descending=[True, False])
        .first()
    )
)

df_unique_customer.head()

shape: (5, 4)

customer_name	postal_cd	customer_id	amount
str	str	str	i64
"大河内恵麻"	"279-0004"	"CS038413000133"	546
"今泉公顕"	"151-0065"	"CS031513000184"	1244
"米倉ヒロ"	"273-0035"	"CS030302000015"	0
"岩本美菜"	"240-0113"	"CS022315000148"	0
"小宮みき"	"272-0021"	"CS030613000001"	0

df_customer.shape[0], df_unique_customer.shape[0]

(21971, 21941)

P-088#

087で作成したデータを元に、顧客データに統合名寄IDを付与したデータを作成せよ。ただし、統合名寄IDは以下の仕様で付与するものとする。

重複していない顧客：顧客ID（customer_id）を設定

重複している顧客：前設問で抽出したレコードの顧客IDを設定

顧客IDのユニーク件数と、統合名寄IDのユニーク件数の差も確認すること。

df_customer_integration = (
    df_customer
    .join(
        df_unique_customer
        .select(
            'customer_name', 'postal_cd', pl.col.customer_id.alias('integration_id')
        ),
    on=['customer_name', 'postal_cd'], 
    how='left',
    )
)
df_customer_integration.head()

shape: (5, 12)

customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd	integration_id
str	str	str	str	str	i64	str	str	str	i64	str	str
"CS021313000114"	"大野あや子"	"1"	"女性"	"1981-04-29"	37	"259-1113"	"神奈川県伊勢原市粟窪**********"	"S14021"	20150905	"0-00000000-0"	"CS021313000114"
"CS037613000071"	"六角雅彦"	"9"	"不明"	"1952-04-01"	66	"136-0076"	"東京都江東区南砂**********"	"S13037"	20150414	"0-00000000-0"	"CS037613000071"
"CS031415000172"	"宇多田貴美子"	"1"	"女性"	"1976-10-04"	42	"151-0053"	"東京都渋谷区代々木**********"	"S13031"	20150529	"D-20100325-C"	"CS031415000172"
"CS028811000001"	"堀井かおり"	"1"	"女性"	"1933-03-27"	86	"245-0016"	"神奈川県横浜市泉区和泉町**********"	"S14028"	20160115	"0-00000000-0"	"CS028811000001"
"CS001215000145"	"田崎美紀"	"1"	"女性"	"1995-03-29"	24	"144-0055"	"東京都大田区仲六郷**********"	"S13001"	20170605	"6-20090929-2"	"CS001215000145"

(
df_customer_integration
.filter(pl.col.customer_id != pl.col.integration_id)
.shape[0]
)

P-089#

売上実績がある顧客を、予測モデル構築のため学習用データとテスト用データに分割したい。それぞれ8:2の割合でランダムにデータを分割せよ。

df_train, df_test = (
    df_receipt
    .group_by('customer_id')
    .agg(pl.col.amount.sum())
    .join(df_customer, on='customer_id', how='left')
    .with_columns(
        split=pl.int_range(pl.len()).shuffle() < pl.len() * 0.2
    )
    .partition_by('split')
)
df_train.shape[0], df_test.shape[0]

(6645, 1662)

P-090#

レシート明細データ（df_receipt）は2017年1月1日〜2019年10月31日までのデータを有している。売上金額（amount）を月次で集計し、学習用に12ヶ月、テスト用に6ヶ月の時系列モデル構築用データを3セット作成せよ。

df_tmp = (
df_receipt
.group_by(
    (pl.col.sales_ymd // 100).alias('sales_ym')
)
.agg(sum_amount=pl.col.amount.sum())
.sort(by='sales_ym')
)

group = (
df_tmp
.with_row_index()
.group_by_dynamic(
    pl.col.index.cast(pl.Int32), every='6i', period='18i', offset='0i'
)
)

dfs = [df for _, df in group][:3]
dfs = [
    df
    .with_columns(
        test_flag=(pl.int_range(pl.len()) > 12).cast(pl.UInt8)
    )
    .select(pl.exclude('index'))
    for df in dfs
]

dfs

[shape: (18, 3)
 ┌──────────┬────────────┬───────────┐
 │ sales_ym ┆ sum_amount ┆ test_flag │
 │ ---      ┆ ---        ┆ ---       │
 │ i64      ┆ i64        ┆ u8        │
 ╞══════════╪════════════╪═══════════╡
 │ 201701   ┆ 902056     ┆ 0         │
 │ 201702   ┆ 764413     ┆ 0         │
 │ 201703   ┆ 962945     ┆ 0         │
 │ 201704   ┆ 847566     ┆ 0         │
 │ 201705   ┆ 884010     ┆ 0         │
 │ …        ┆ …          ┆ …         │
 │ 201802   ┆ 864128     ┆ 1         │
 │ 201803   ┆ 946588     ┆ 1         │
 │ 201804   ┆ 937099     ┆ 1         │
 │ 201805   ┆ 1004438    ┆ 1         │
 │ 201806   ┆ 1012329    ┆ 1         │
 └──────────┴────────────┴───────────┘,
 shape: (18, 3)
 ┌──────────┬────────────┬───────────┐
 │ sales_ym ┆ sum_amount ┆ test_flag │
 │ ---      ┆ ---        ┆ ---       │
 │ i64      ┆ i64        ┆ u8        │
 ╞══════════╪════════════╪═══════════╡
 │ 201707   ┆ 959205     ┆ 0         │
 │ 201708   ┆ 954836     ┆ 0         │
 │ 201709   ┆ 902037     ┆ 0         │
 │ 201710   ┆ 905739     ┆ 0         │
 │ 201711   ┆ 932157     ┆ 0         │
 │ …        ┆ …          ┆ …         │
 │ 201808   ┆ 1045793    ┆ 1         │
 │ 201809   ┆ 977114     ┆ 1         │
 │ 201810   ┆ 1069939    ┆ 1         │
 │ 201811   ┆ 967479     ┆ 1         │
 │ 201812   ┆ 1016425    ┆ 1         │
 └──────────┴────────────┴───────────┘,
 shape: (18, 3)
 ┌──────────┬────────────┬───────────┐
 │ sales_ym ┆ sum_amount ┆ test_flag │
 │ ---      ┆ ---        ┆ ---       │
 │ i64      ┆ i64        ┆ u8        │
 ╞══════════╪════════════╪═══════════╡
 │ 201801   ┆ 944509     ┆ 0         │
 │ 201802   ┆ 864128     ┆ 0         │
 │ 201803   ┆ 946588     ┆ 0         │
 │ 201804   ┆ 937099     ┆ 0         │
 │ 201805   ┆ 1004438    ┆ 0         │
 │ …        ┆ …          ┆ …         │
 │ 201902   ┆ 959538     ┆ 1         │
 │ 201903   ┆ 1093753    ┆ 1         │
 │ 201904   ┆ 1044210    ┆ 1         │
 │ 201905   ┆ 1111985    ┆ 1         │
 │ 201906   ┆ 1089063    ┆ 1         │
 └──────────┴────────────┴───────────┘]

データサイエンス100本ノック 81~90

Contents

データサイエンス100本ノック 81~90#

P-081#

P-082#

P-083#

P-084#

P-085#

P-086#

P-087#

P-088#

P-089#

P-090#