
๋ก๋ ๋ฐ์ดํฐ ๋ถ์์ ํตํด ๋น์ฒจ ๋ฒํธ์ ํจํด์ ์ฐพ์๋ณด๊ณ , XGBoost๋ฅผ ํ์ฉํ์ฌ ๋ค์ ๋น์ฒจ ๋ฒํธ๋ฅผ ์์ธกํ๋ ๋ฐฉ๋ฒ์ ์๊ฐํ๋ค. ์ด๋ฒ ๋ถ์์์๋ ํ์/์ง์ ๋น์จ, ๋ฎ์ ์ซ์ vs ๋์ ์ซ์ ๋น์จ, ์๋ณ ๋น์ฒจ ๋ฒํธ ๋ถ์ ๋ฑ์ ์งํํ๊ณ , ๋จธ์ ๋ฌ๋ ๋ชจ๋ธ์ ํ์ฉํด ๋ฒํธ๋ฅผ ์์ธกํด๋ณผ ๊ฒ์ด๋ค.
1. 홀수 vs 짝수 and 낮은 숫자(1~22) vs 높은 숫자(23~45) 비율 분석
로또 당첨 번호에서 홀수와 짝수의 출현 비율을 분석했다.
def odd_even_ratio(df, columns=None):
    """Count how many drawn numbers are odd and how many are even.

    Args:
        df: DataFrame with one draw per row.
        columns: Names of the columns holding the drawn numbers. Defaults
            to "번호1" .. "번호6".

    Returns:
        dict mapping "홀수" (odd) and "짝수" (even) to the total number of
        matching values across all selected columns, as plain ``int``.
    """
    # NOTE(review): the Korean literals below were restored from mis-decoded
    # (mojibake) text -- confirm they match the dataset's real column headers.
    if columns is None:
        columns = [f"번호{i}" for i in range(1, 7)]

    # One pass over all selected columns instead of two scans per column.
    remainders = df[list(columns)].to_numpy() % 2
    return {
        "홀수": int((remainders == 1).sum()),
        "짝수": int((remainders == 0).sum()),
    }
def low_high_ratio(df, columns=None):
    """Count how many drawn numbers are low (<= 22) and how many are high (> 22).

    Args:
        df: DataFrame with one draw per row.
        columns: Names of the columns holding the drawn numbers. Defaults
            to "번호1" .. "번호6".

    Returns:
        dict mapping "낮은 숫자 (1-22)" (low) and "높은 숫자 (23-45)" (high)
        to the total number of matching values across all selected columns,
        as plain ``int``.
    """
    # NOTE(review): the Korean literals below were restored from mis-decoded
    # (mojibake) text -- confirm they match the dataset's real column headers.
    if columns is None:
        columns = [f"번호{i}" for i in range(1, 7)]

    # One pass over all selected columns instead of two scans per column.
    values = df[list(columns)].to_numpy()
    return {
        "낮은 숫자 (1-22)": int((values <= 22).sum()),
        "높은 숫자 (23-45)": int((values > 22).sum()),
    }
이 데이터를 활용하여 그래프를 시각화했다.
# Run the analysis
odd_even_counts = odd_even_ratio(lotto_dataset)
low_high_counts = low_high_ratio(lotto_dataset)

# Visualization: one bar chart per ratio, side by side.
# NOTE(review): the Korean chart labels were restored from mis-decoded
# (mojibake) text -- confirm the wording.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = [
    (axes[0], odd_even_counts, "coolwarm", "홀수 vs 짝수 비율"),
    (axes[1], low_high_counts, "viridis", "낮은 숫자 (1-22) vs 높은 숫자 (23-45) 비율"),
]
for ax, panel_counts, palette_name, panel_title in panels:
    sns.barplot(
        x=list(panel_counts.keys()),
        y=list(panel_counts.values()),
        palette=palette_name,
        ax=ax,
    )
    ax.set_title(panel_title, fontsize=14)
    ax.set_ylabel("출현 횟수")
    # The y-axis window is hard-coded; bars outside 3200-3600 will be clipped.
    ax.set_ylim(3200, 3600)
    ax.grid(axis="y", linestyle="--", alpha=0.8)

plt.tight_layout()
plt.show()

홀수가 짝수보다 비교적 당첨 횟수가 더 많았으며, 낮은 숫자보다 높은 숫자가 더 많이 당첨됐다.
2. 월별 가장 많이 나온 번호 분석
palette = sns.color_palette("rocket", 12)

# NOTE(review): the Korean column names and chart labels in this section were
# restored from mis-decoded (mojibake) text -- confirm them against the dataset.

# Add a month column derived from the draw date
lotto_dataset["월"] = lotto_dataset["추첨일"].dt.month

# Columns holding the six drawn numbers
number_columns = ["번호1", "번호2", "번호3", "번호4", "번호5", "번호6"]

# Per month: how often each number was drawn, plus the single most frequent
# number and its count.
monthly_number_counts = {month: {} for month in range(1, 13)}
most_frequent_numbers = {}
most_frequent_counts = {}
for month in range(1, 13):
    monthly_data = lotto_dataset[lotto_dataset["월"] == month]
    number_counts = monthly_data[number_columns].to_numpy().ravel()
    if number_counts.size == 0:
        # No draws for this month: leave its count table empty and skip it
        # rather than asking for the maximum of nothing.
        continue
    unique, counts = np.unique(number_counts, return_counts=True)
    monthly_number_counts[month] = dict(zip(unique, counts))
    # argmax returns the first maximum, so ties go to the smallest number.
    top = counts.argmax()
    most_frequent_numbers[month] = unique[top].item()
    most_frequent_counts[month] = counts[top].item()

# Visualization
plotted_months = list(most_frequent_numbers)
plt.figure(figsize=(12, 6))
plt.bar(
    plotted_months,
    [most_frequent_counts[m] for m in plotted_months],
    # Index the palette by month so a bar keeps its colour even if a month is skipped.
    color=[palette[m - 1] for m in plotted_months],
)
plt.xlabel("월")
plt.ylabel("당첨 횟수")
plt.title("월별 가장 많이 당첨된 로또 번호와 당첨 횟수")
plt.xticks(range(1, 13))
# The y-axis window is hard-coded; counts outside 10-30 will be clipped.
plt.ylim(10, 30)
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Write the winning number above each bar
for month, num in most_frequent_numbers.items():
    plt.text(month, most_frequent_counts[month], str(num), ha="center", va="bottom", fontsize=12, fontweight="bold")

plt.show()

์๋ณ๋ก ๋ง์ด ๋น์ฒจ๋ ๋ฒํธ ํ ๊ฐ๋ง ํ์ธํด ๋ณด์์๋ ์์ ํ์ ๊ฐ์ด ๋์๋ค. ์ ์ผ ์ ๋์ฌ๊ฒ ๊ฐ์๋ 1์ด 6์๊ณผ 10์ ๋ ๊ฐ์ ๋ฌ์ ๋์จ๊ฒ ์ข ์ ๊ธฐํ๋ค.
4. XGBoost๋ฅผ ํ์ฉํ ๋ก๋ ๋ฒํธ ์์ธก
๋ก๋ ๋ฒํธ ์์ธก์ ์ํด XGBoost๋ฅผ ํ์ฉํ๋ค. ๊ฐ ๋ฒํธ(1~45)์ ์ถํ ์ฌ๋ถ๋ฅผ ํ์ตํ๋ ๋ฐฉ์์ผ๋ก ์งํํ๋ค.
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Data preparation
# NOTE(review): the Korean column names in this section were restored from
# mis-decoded (mojibake) text; the "년도" literal had additionally been split
# across two lines. Confirm the names against the dataset's real headers.
lotto_dataset["월"] = lotto_dataset["추첨일"].dt.month
lotto_dataset["년도"] = lotto_dataset["추첨일"].dt.year

# Build one 0/1 indicator column per ball (1-45): was it drawn in that round?
# A vectorised comparison replaces 45 row-wise apply() passes.
drawn_numbers = lotto_dataset[["번호1", "번호2", "번호3", "번호4", "번호5", "번호6"]]
for i in range(1, 46):
    lotto_dataset[f"번호_{i}"] = drawn_numbers.eq(i).any(axis=1).astype(int)

features = ["년도", "월", "회차"]
X = lotto_dataset[features]
y = lotto_dataset[[f"번호_{i}" for i in range(1, 46)]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model training (multi-label: one binary target per ball).
# `use_label_encoder` is dropped: it is deprecated in the XGBoost releases
# that accept a multi-column target.
model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss")
model.fit(X_train_scaled, y_train)

# Run predictions on the held-out draws
y_pred = model.predict(X_test_scaled)

# Report the share of test draws whose 45 indicators are all predicted correctly.
# NOTE(review): the post quotes an accuracy figure but the original code never
# computed one; exact-match accuracy is assumed here -- confirm the intended metric.
exact_match_accuracy = (np.asarray(y_pred) == y_test.to_numpy()).all(axis=1).mean()
print(f"모델 정확도: {exact_match_accuracy:.4f}")
# Predict the next draw
def predict_next_draw(model, last_draw, feature_scaler=None):
    """Return the six numbers the model rates as most likely, in ascending order.

    Args:
        model: Fitted classifier whose ``predict_proba`` returns one row of
            per-number probabilities (column 0 is number 1).
        last_draw: Feature values for a single draw, in training order.
        feature_scaler: Fitted scaler exposing ``transform``. Defaults to
            the module-level ``scaler``.

    Returns:
        Sorted list of six plain ``int`` lotto numbers.
    """
    active_scaler = scaler if feature_scaler is None else feature_scaler
    last_draw_scaled = active_scaler.transform([last_draw])
    probabilities = model.predict_proba(last_draw_scaled)
    # argsort is ascending, so the last six indices are the most probable;
    # +1 converts the 0-based column index to the 1-based lotto number.
    top_six = np.argsort(probabilities[0])[-6:] + 1
    # Convert NumPy integers to plain ints so the result prints as [25, 26, ...].
    return sorted(int(number) for number in top_six)
# NOTE(review): this passes the features of the last row of the dataset (an
# already-known draw), not the date/round of the upcoming draw -- confirm this
# is intended for a "next draw" prediction, and that the last row is the latest draw.
latest_data = lotto_dataset[features].iloc[-1].values
predicted_numbers = predict_next_draw(model, latest_data)
# NOTE(review): the Korean label was restored from mis-decoded (mojibake) text.
print(f"예측된 로또 번호: {predicted_numbers}")
# Output recorded by the author:
#   Model accuracy: 0.0560
#   Predicted lotto numbers: [25, 26, 27, 32, 38, 39]
결론
์ด๋ฒ ๋ถ์์์๋ ๋ก๋ ๋ฒํธ์ ํจํด์ ๋ถ์ํ๊ณ , XGBoost๋ฅผ ํ์ฉํ์ฌ ๋ฒํธ๋ฅผ ์์ธกํ๋ ๋ชจ๋ธ์ ๊ตฌํํด๋ณด์๋ค. ์ ํ๋(Accuracy)๊ฐ ๊ฒจ์ฐ 5.6%(0.056)์ด๋ฏ๋ก ํ์ ํ ๋ฎ์ ์ ํ๋๋ฅผ ๋ณด์ด์ง๋ง, ์ฌ๋ฌ๊ฐ์ง ๊ธฐ๋ฒ์ ์ฌ์ฉํด ๋ช%๊น์ง ์ฌ๋ฆด ์ ์๋์ง ํ ์คํธํ ํ ๋ค์ ๊ธ์์ ์์ฑํด๋ณด๋๋ก ํ๊ฒ ๋ค.