
[Python][AI] 한국 로또 분석: 당첨 확률과 예측의 불가능성
1. 한국 로또 개념 로또는 1부터 45까지의 숫자 중 6개를 선택하는 방식의 복권이다. 구매자는 다음과 같은 방법으로 번호를 선택할 수 있다. 자동 선택: 45개의 번호 중 6개 번호를 무작위로 부여받…
5hr1rnp.tistory.com
๋ณธ ๊ธ์์๋ ๋ก๋ ๋น์ฒจ ๋ฐ์ดํฐ์ ๋ํ ํ์์ ๋ฐ์ดํฐ ๋ถ์(EDA)์ ์งํํ๋ฉฐ, ๋ค์ํ ํต๊ณ์ ํน์ง์ ์ดํด๋ณด๊ฒ ๋ค.
1. ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ
๋ก๋ ๋ฐ์ดํฐ๋ ๋ํ๋ณต๊ถ ์ฌ์ดํธ์์ ์ ๊ณตํ๋ ์์ ํ์ผ์ ๊ฐ๊ฐ lotto-1.xls(1~600ํ), lotto-2.xls(601~1,159ํ)๋ก ์ ์ฅํจ ('25. 02. 18. ๊ธฐ์ค).
๋ ๋ฐ์ดํฐ๋ฅผ Python์ผ๋ก ๋ถ๋ฌ์ ๋ค์๊ณผ ๊ฐ์ด ์ ์ฒ๋ฆฌํจ.
๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ๋ฆฌ
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Use the NanumGothic font family for figures (needed so the Hangul labels
# used below can be rendered — assumes the font is installed).
plt.rcParams['font.family'] = 'NanumGothic'
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False
# IPython magics (notebook only): load the watermark extension and print the
# interpreter, machine, and package versions listed below.
%load_ext watermark
%watermark -v -m -p pandas,numpy,matplotlib,seaborn
# Python version : 3.10.15
# IPython version : 8.29.0
# pandas : 2.2.3
# numpy : 1.23.5
# matplotlib: 3.9.2
# seaborn : 0.13.2
# Parse the downloaded result files. read_html is used on the .xls files
# (presumably they are HTML tables saved with an .xls extension) and returns
# every table found in each file.
df1_raw = pd.read_html("./lotto/lotto-1.xls", encoding='euc-kr', index_col=False)
df2_raw = pd.read_html("./lotto/lotto-2.xls", encoding='euc-kr', index_col=False)

# The draw results are taken from the second parsed table of each file.
df1_table = df1_raw[1]
df2_table = df2_raw[1]

# Use the second row as a provisional header ...
df1_table.columns = df1_table.iloc[1]
df2_table.columns = df2_table.iloc[1]

# ... then drop the two leading header rows so only data rows remain.
df1_table = df1_table[2:].reset_index(drop=True)
df2_table = df2_table[2:].reset_index(drop=True)

# Replace the header with flat, explicit column names
# (year, draw number, draw date, winners/prize per rank 1-5, numbers, bonus).
df1_table.columns = [
    '년도', '회차', '추첨일',
    '1등 당첨자수', '1등 당첨금액',
    '2등 당첨자수', '2등 당첨금액',
    '3등 당첨자수', '3등 당첨금액',
    '4등 당첨자수', '4등 당첨금액',
    '5등 당첨자수', '5등 당첨금액',
    '번호1', '번호2', '번호3', '번호4', '번호5', '번호6', '보너스번호',
]
df2_table.columns = df1_table.columns  # both files share the same layout

# Winner-count and prize-amount columns that must be converted to integers.
numeric_cols = ["1등 당첨자수", "1등 당첨금액", "2등 당첨자수", "2등 당첨금액",
                "3등 당첨자수", "3등 당첨금액", "4등 당첨자수", "4등 당첨금액",
                "5등 당첨자수", "5등 당첨금액"]
def clean_numeric_columns(df, cols):
    """Convert formatted count/amount columns of ``df`` to ``int64``.

    Each column in ``cols`` is read as text, thousands separators (",") and
    the "원" (won) currency suffix are removed, surrounding whitespace is
    stripped, and the result is cast to ``np.int64``. Columns that are
    already integers pass through unchanged, so the call is safe to repeat.

    Args:
        df: DataFrame to clean. It is modified in place.
        cols: Iterable of column labels to convert.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If a value is not an integer literal after cleaning.
    """
    for col in cols:
        # Going through str first keeps the .str accessor valid even when the
        # column has already been converted to a numeric dtype.
        as_text = df[col].astype(str)
        digits = (
            as_text.str.replace(",", "", regex=False)
            .str.replace("원", "", regex=False)
            .str.strip()
        )
        df[col] = digits.astype(np.int64)
    return df
# Strip separators/currency suffix and cast the winner/prize columns to int64.
df1_table = clean_numeric_columns(df1_table, numeric_cols)
df2_table = clean_numeric_columns(df2_table, numeric_cols)

# Draw dates arrive as "YYYY.MM.DD" strings; parse them once per table.
df1_table["추첨일"] = pd.to_datetime(df1_table["추첨일"], format="%Y.%m.%d")
df2_table["추첨일"] = pd.to_datetime(df2_table["추첨일"], format="%Y.%m.%d")

# Stack both tables into a single frame.
lotto_dataset = pd.concat([df1_table, df2_table], ignore_index=True)

# Year, draw number, and the drawn numbers are still text at this point.
lotto_obj_cols = ['년도', '회차', '번호1', '번호2', '번호3', '번호4', '번호5', '번호6', '보너스번호']
for col in lotto_obj_cols:
    lotto_dataset[col] = lotto_dataset[col].astype(np.int64)

# Order rows chronologically by draw number. This must happen after the
# integer cast (sorting the text values would be lexicographic), and sorting
# by year alone would leave draws within the same year in arbitrary order,
# which breaks any positional time index built on top of this frame.
lotto_dataset = lotto_dataset.sort_values('회차').reset_index(drop=True)
# 년도 회차 추첨일 1등 당첨자수 1등 당첨금액 2등 당첨자수 2등 당첨금액 3등 당첨자수 3등 당첨금액 4등 당첨자수 4등 당첨금액 5등 당첨자수 5등 당첨금액 번호1 번호2 번호3 번호4 번호5 번호6 보너스번호
# 0 2002 1 2002-12-07 0 0 1 143934100 28 5140500 2537 113400 40155 10000 10 23 29 33 37 40 16
# 1 2002 2 2002-12-14 1 2002006800 2 94866800 103 1842000 3763 100800 55480 10000 9 13 21 25 32 42 2
# 2 2002 3 2002-12-21 1 2000000000 0 0 139 1174100 5940 54900 73256 10000 11 16 19 21 27 31 30
# 3 2002 4 2002-12-28 0 0 1 211191200 29 7282400 2777 152100 52382 10000 14 27 30 31 40 42 2
# 4 2003 21 2003-04-26 23 797475400 26 117576500 2008 1522400 82996 73600 1155368 10000 6 12 17 18 31 32 21
# ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
# 1154 2025 1156 2025-01-25 21 1505207090 73 72167464 3494 1507792 162883 50000 2706409 5000 30 31 34 39 41 45 7
# 1155 2025 1155 2025-01-18 7 4066375179 78 60821851 3171 1496091 160366 50000 2654046 5000 10 16 19 27 37 38 13
# 1156 2025 1154 2025-01-11 15 1854965425 62 74796993 3190 1453735 154785 50000 2559114 5000 4 8 22 26 32 38 27
# 1157 2025 1153 2025-01-04 15 2027312925 79 64155473 3506 1445603 167692 50000 2848165 5000 1 9 10 13 35 44 5
# 1158 2025 1157 2025-02-01 12 2257842157 141 32026130 4549 992677 201847 50000 3100779 5000 5 7 12 20 25 26 28
# 1159 rows × 20 columns
2. 데이터 개요
통합된 데이터에 대한 구조 확인
# Inspect the dataset structure (column dtypes and non-null counts)
lotto_dataset.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1159 entries, 0 to 1158
# Data columns (total 20 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 년도 1159 non-null int64
# 1 회차 1159 non-null int64
# 2 추첨일 1159 non-null datetime64[ns]
# 3 1등 당첨자수 1159 non-null int64
# 4 1등 당첨금액 1159 non-null int64
# 5 2등 당첨자수 1159 non-null int64
# 6 2등 당첨금액 1159 non-null int64
# 7 3등 당첨자수 1159 non-null int64
# 8 3등 당첨금액 1159 non-null int64
# 9 4등 당첨자수 1159 non-null int64
# 10 4등 당첨금액 1159 non-null int64
# 11 5등 당첨자수 1159 non-null int64
# 12 5등 당첨금액 1159 non-null int64
# 13 번호1 1159 non-null int64
# 14 번호2 1159 non-null int64
# 15 번호3 1159 non-null int64
# 16 번호4 1159 non-null int64
# 17 번호5 1159 non-null int64
# 18 번호6 1159 non-null int64
# 19 보너스번호 1159 non-null int64
# dtypes: datetime64[ns](1), int64(19)
# memory usage: 181.2 KB
# Descriptive statistics for every column, rounded to three decimals
round(lotto_dataset.describe(), 3)
# 년도 회차 추첨일 1등 당첨자수 1등 당첨금액 2등 당첨자수 2등 당첨금액 3등 당첨자수 3등 당첨금액 4등 당첨자수 4등 당첨금액 5등 당첨자수 5등 당첨금액 번호1 번호2 번호3 번호4 번호5 번호6 보너스번호
# count 1159.000 1159.000 1159 1159.000 1.159000e+03 1159.000 1.159000e+03 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000 1159.000
# mean 2013.530 580.000 2014-01-11 00:00:00 8.097 2.665804e+09 49.023 6.332445e+07 1851.280 1585731.174 90760.637 57301.499 1487654.831 5375.324 6.752 13.090 19.954 26.234 32.840 39.478 22.334
# min 2002.000 1.000 2002-12-07 00:00:00 0.000 0.000000e+00 0.000 0.000000e+00 28.000 627634.000 2537.000 27300.000 40155.000 5000.000 1.000 2.000 3.000 5.000 9.000 18.000 1.000
# 25% 2008.000 290.500 2008-06-24 12:00:00 5.000 1.566595e+09 33.000 4.878907e+07 1294.000 1354515.500 64195.500 50000.000 1060499.000 5000.000 3.000 8.000 14.000 21.000 28.000 37.000 10.500
# 50% 2014.000 580.000 2014-01-11 00:00:00 7.000 2.082099e+09 44.000 5.712570e+07 1650.000 1478429.000 80303.000 50000.000 1324790.000 5000.000 5.000 12.000 19.000 27.000 34.000 41.000 23.000
# 75% 2019.000 869.500 2019-07-30 12:00:00 10.000 3.019093e+09 62.000 6.688088e+07 2339.000 1603055.000 114261.000 56207.000 1865719.500 5000.000 10.000 18.000 25.500 32.000 38.000 44.000 34.000
# max 2025.000 1159.000 2025-02-15 00:00:00 63.000 4.072296e+10 664.000 7.694565e+08 11247.000 9307100.000 703234.000 260000.000 3410846.000 10000.000 35.000 37.000 40.000 43.000 44.000 45.000 45.000
# std 6.415 334.719 NaN 4.860 2.398714e+09 28.782 3.692096e+07 784.666 641177.352 38495.307 20277.336 553517.220 1318.047 5.406 6.892 7.436 7.541 6.821 5.292 13.204
데이터셋은 1159행×20열로 구성되어 있으며, 모든 컬럼에 결측치가 없음.
1등 당첨금액을 보면, 최대값이 400억이 넘는 것을 확인할 수 있다. 궁금해서 1등 당첨금액 순으로 확인해 봤을 때 다음과 같았다.
# Ten largest first-prize amounts, highest first
lotto_dataset['1등 당첨금액'].sort_values(ascending=False).head(10)
# 18 40722959400
# 24 24227745300
# 19 19352212800
# 42 17749630800
# 14 17014245000
# 35 16014475800
# 61 15817286400
# 32 14903517600
# 81 14562494400
# 85 14252186400
# Name: 1등 당첨금액, dtype: int64
100억이 넘는 1등 당첨금액이 이리도 많았던가? 해서 추첨일을 찾아보았다.
# Draw dates of the ten largest first-prize amounts. Rows and column are
# selected in a single .loc call instead of chained indexing.
top10_prize_index = lotto_dataset['1등 당첨금액'].sort_values(ascending=False).head(10).index
lotto_dataset.loc[top10_prize_index, '추첨일']
# 18 2003-04-12
# 24 2003-05-24
# 19 2003-04-19
# 42 2003-09-27
# 14 2003-03-15
# 35 2003-08-09
# 61 2004-02-07
# 32 2003-07-19
# 81 2004-06-26
# 85 2004-07-24
# Name: 추첨일, dtype: datetime64[ns]
2003 ~ 2004년에 100억이 넘는 1등 당첨금을 받는 일이 많았던 것을 확인할 수 있었다.
3. 시각화 분석
(1) 추첨일에 따른 1등 당첨금액 & 당첨자수 변화 및 선형 추세선
import statsmodels.api as sm
# Use the row position (time order) as the regressor for the trend lines.
# NOTE: this is only a meaningful time axis if lotto_dataset is ordered
# chronologically when this cell runs.
lotto_dataset['회차인덱스'] = np.arange(len(lotto_dataset))

# (1) Linear regression of the first-prize amount on the time index.
X = sm.add_constant(lotto_dataset[['회차인덱스']])  # adds the intercept term
y = lotto_dataset['1등 당첨금액']
model = sm.OLS(y, X).fit()
lotto_dataset['금액_추세선'] = model.predict(X)

# (2) Linear regression of the first-prize winner count on the same design
# matrix (X was built before the trend column was added, so it is unchanged).
X2 = X
y2 = lotto_dataset['1등 당첨자수']
model2 = sm.OLS(y2, X2).fit()
lotto_dataset['당첨자수_추세선'] = model2.predict(X2)

# Two stacked panels sharing the draw-date axis: raw series plus fitted line.
fig, axes = plt.subplots(2, 1, figsize=(16, 10), sharex=True)

# First-prize amount (raw data + regression line)
axes[0].plot(lotto_dataset['추첨일'], lotto_dataset['1등 당첨금액'],
             label='1등 당첨금액(원)', color='blue')
axes[0].plot(lotto_dataset['추첨일'], lotto_dataset['금액_추세선'],
             label='추세선(선형 회귀)', color='red')
axes[0].set_ylabel('1등 당첨금액(원)')
axes[0].set_title('추첨일에 따른 1등 당첨금액 변화 및 선형 추세선')
axes[0].grid(True)
axes[0].legend(loc='upper center', fontsize=15)

# First-prize winner count (raw data + regression line)
axes[1].plot(lotto_dataset['추첨일'], lotto_dataset['1등 당첨자수'],
             label='1등 당첨자수(명)', color='orange')
axes[1].plot(lotto_dataset['추첨일'], lotto_dataset['당첨자수_추세선'],
             label='추세선(선형 회귀)', color='red')
axes[1].set_xlabel('추첨일')
axes[1].set_ylabel('1등 당첨자수(명)')
axes[1].set_title('추첨일에 따른 1등 당첨자수 변화 및 선형 추세선')
axes[1].grid(True)
axes[1].legend(loc='upper center', fontsize=15)

plt.tight_layout()
plt.show()

1등 당첨금액은 감소하는 추세고, 1등 당첨자는 증가하는 추세라는 것을 확인할 수 있었다. 당첨자수가 증가함에 따라 1등 당첨 금액이 줄어든 것으로 보인다. 복권 구매량(판매량)에 대한 데이터가 없어서 구매자 대비 당첨률은 확인할 수 없었다.
(2) 로또 번호별 출현 빈도
from collections import Counter
# One colour per lotto number (45 in total).
palette = sns.color_palette("rocket", 45)

# Collect the drawn numbers: all seven, the six main numbers only, and the
# bonus number only. The "no bonus" array must exclude the bonus column,
# otherwise it is identical to all_numbers.
main_number_cols = ['번호1', '번호2', '번호3', '번호4', '번호5', '번호6']
all_numbers = lotto_dataset[main_number_cols + ['보너스번호']].values.flatten()
all_but_no_bonus_numbers = lotto_dataset[main_number_cols].values.flatten()
only_bonus_numbers = lotto_dataset['보너스번호'].values.flatten()

# Appearance count per number.
number_counts = Counter(all_numbers)
all_but_no_bonus_counts = Counter(all_but_no_bonus_numbers)
only_bonus_numbers_counts = Counter(only_bonus_numbers)

# Sort each Counter by number in descending order, then split into parallel
# (numbers, counts) tuples for plotting.
sorted_items = sorted(number_counts.items(), key=lambda x: x[0], reverse=True)
no_bonus_sorted_items = sorted(all_but_no_bonus_counts.items(), key=lambda x: x[0], reverse=True)
bonus_sorted_items = sorted(only_bonus_numbers_counts.items(), key=lambda x: x[0], reverse=True)
sorted_keys, sorted_values = zip(*sorted_items)
no_bonus_sorted_keys, no_bonus_sorted_values = zip(*no_bonus_sorted_items)
bonus_sorted_keys, bonus_sorted_values = zip(*bonus_sorted_items)

# Three stacked bar charts sharing the number axis.
fig, axes = plt.subplots(3, 1, figsize=(16, 10), sharex=True)

# (numbers, counts, title, y-range). The middle panel has no fixed range:
# the 140-210 window used for the first panel was tuned for counts that
# include the bonus number, so its range is derived from the data instead.
panels = [
    (sorted_keys, sorted_values, "로또 번호별 출현 빈도(번호 전체)", (140, 210)),
    (no_bonus_sorted_keys, no_bonus_sorted_values, "로또 번호별 출현 빈도(보너스 번호 제외)", None),
    (bonus_sorted_keys, bonus_sorted_values, "로또 번호별 출현 빈도(보너스 번호)", (10, 40)),
]
for ax, (keys, values, title, ylim) in zip(axes, panels):
    ax.bar(keys, values, color=palette)
    ax.set_ylabel("출현 횟수")
    ax.set_title(title)
    if ylim is None:
        ylim = (max(0, min(values) - 10), max(values) + 10)
    ax.set_ylim(*ylim)
    ax.grid(axis="y", linestyle="--", alpha=0.8)

# Only the bottom panel carries the shared x-axis label and ticks.
axes[2].set_xlabel("로또 번호")
axes[2].set_xticks(range(1, 46))
plt.tight_layout()

1등 당첨 번호로 가장 많이 선정된 숫자는 '34', 가장 적게 선정된 숫자는 '9'라는 것을 확인할 수 있었다. 보너스 번호를 제외하고서도 가장 많이 선정된 숫자는 '34', 가장 적게 선정된 숫자는 '9'였다. 이와 달리, 보너스 번호 중에서는 '43'이 가장 많이 선정됐으며 '29'가 가장 적게 선정됐다.
(3) 가장 많이 등장한 로또 숫자 조합 (TOP 10)
from itertools import combinations
# Counter holding how many draws contain each number combination.
combination_counter = Counter()

# Sort every draw (six numbers plus bonus) once up front. .tolist() yields
# plain Python ints, so the tuple labels below print as "(3, 20)" regardless
# of how the installed NumPy version formats its scalar types.
number_cols = ['번호1', '번호2', '번호3', '번호4', '번호5', '번호6', '보너스번호']
sorted_draws = [sorted(row.tolist()) for row in lotto_dataset[number_cols].values]

# Count every combination of size 2 through 6 within each draw.
for r in range(2, 7):
    for drawn in sorted_draws:
        combination_counter.update(combinations(drawn, r))

# Ten most frequent combinations.
most_common_combinations = combination_counter.most_common(10)

# Turn the combinations into string labels for the categorical axis.
combo_labels = [str(combo) for combo, _ in most_common_combinations]
combo_counts = [count for _, count in most_common_combinations]

# Horizontal bar chart. hue mirrors y with the legend disabled, which is the
# form seaborn >= 0.13 expects when a palette is given for a single variable.
plt.figure(figsize=(12, 6))
sns.barplot(x=combo_counts, y=combo_labels, hue=combo_labels,
            palette="Blues_r", legend=False)
plt.xlim(32, 42)
plt.xlabel("등장 횟수")
plt.ylabel("숫자 조합")
plt.title("가장 많이 등장한 로또 숫자 조합 (TOP 10)")
plt.grid(axis="x", linestyle="--", alpha=0.7)
plt.tight_layout()

보너스 번호를 포함하여 2~6개까지 가능한 조합을 조회한 결과, 가장 많이 등장한 숫자 조합은 (3,20)이었다.
이번 글에서는 여기까지 진행하고, 다음 글에서 추가로 진행하겠다.