比赛结果预测_决策树_随机森林(通用 数据挖掘入门与实践-实验5)
生活随笔
收集整理的这篇文章主要介绍了《比赛结果预测_决策树_随机森林(通用 数据挖掘入门与实践-实验5)》,
小编觉得挺不错的,现在分享给大家,帮大家做个参考。
# Data import: load the games dataset, parsing "Date" as datetimes.
import pandas as pd

data_filename = "datasets.csv"
# Single read with parse_dates (the original read the file twice and the
# first result was immediately overwritten). The fused scraped line
# `import pandas as pddata_filename=...` was also a syntax error.
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
# dataset.loc[:5]  # quick sanity check of the first rows
# Method 1: features from each team's previous game, used for prediction.
from collections import defaultdict

# Target variable: did the home team win this game?
dataset["HomeWin"] = dataset["VisitorPTS"] < dataset["HomePTS"]
y_true = dataset["HomeWin"].values

# won_last[team] == 1 if `team` won its most recent game, else 0
# (defaultdict gives 0 for a team's first appearance).
won_last = defaultdict(int)
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0

# Walk games in chronological order so "last win" only looks at past games.
for index, row in dataset.sort_values("Date").iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Write directly into the DataFrame; the original mutated the `row`
    # copy and assigned the whole row back, which is equivalent but slower.
    dataset.loc[index, "HomeLastWin"] = won_last[home_team]
    dataset.loc[index, "VisitorLastWin"] = won_last[visitor_team]
    # Record this game's outcome for each team's next appearance.
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])
# Decision tree + cross-validation on the "last win" features.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

clf = DecisionTreeClassifier(random_state=14)
# The scraped source fused the two lines below into one (a syntax error).
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
scores = cross_val_score(clf, X_previouswins, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# Method 2: previous-game features plus the teams' relative standings.
standings = pd.read_csv("standings.csv")

dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    # Bug fix: the original assigned to a misspelled column
    # ("HomeTeamRankingHigher") on the row copy and never wrote it back,
    # so the feature column stayed all zeros.
    # NOTE(review): this keeps the original's `home_rank > visitor_rank`
    # comparison; if "Rk" is a rank where smaller is better, confirm the
    # intended direction against the standings file.
    dataset.loc[index, "HomeTeamRanksHigher"] = int(home_rank > visitor_rank)

# Decision tree + cross-validation with the ranking feature included.
X_homehigher = dataset[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Method 3: ignore home/visitor -- record who won the last head-to-head match.
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Sort the pair so (A, B) and (B, A) share one dictionary entry.
    teams = tuple(sorted([home_team, visitor_team]))
    row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == home_team else 0
    dataset.loc[index] = row
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

# Decision tree + cross-validation.
clf = DecisionTreeClassifier(random_state=14)
# Bug fix: the original used only ["HomeLastWin", "VisitorLastWin"] here,
# making this run identical to method 1 and silently discarding the
# HomeTeamWonLast feature computed above.
X_lastwinner = dataset[["HomeLastWin", "VisitorLastWin", "HomeTeamWonLast"]].values
scores = cross_val_score(clf, X_lastwinner, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Method 4: turn team names into numeric features.
from sklearn.preprocessing import LabelEncoder

encoding = LabelEncoder()
# Fit on the home-team names, then transform both columns with one mapping
# so a team gets the same id whether it plays at home or away.
encoding.fit(dataset["Home Team"].values)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
# Stack into one (n_games, 2) integer matrix: [home_id, visitor_id].
X_teams = np.vstack([home_teams, visitor_teams]).T
# print(X_teams)

# One-hot encode the ids so the tree does not treat them as ordered numbers.
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder()
# Bug fix: .todense() returns np.matrix, which recent scikit-learn / NumPy
# reject; .toarray() yields a plain ndarray with the same values.
X_teams_expanded = onehot.fit_transform(X_teams).toarray()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams_expanded, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Random forest -- first with the team-id features only.
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=14)
forest_scores = cross_val_score(forest, X_teams, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(forest_scores) * 100))

# Then with the team ids and the "last win" features combined.
# X_previouswins
X_all = np.hstack([X_previouswins, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# GridSearchCV: search for the best random-forest hyper-parameters.
from sklearn.model_selection import GridSearchCV

parameter_space = {
    # "sqrt" replaces the original "auto", which was removed in
    # scikit-learn >= 1.3 (for classifiers "auto" meant sqrt(n_features)).
    "max_features": [2, 10, "sqrt"],
    "n_estimators": [100],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
# print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

# Random forest with the best parameters found by the grid search above.
best_params = dict(
    bootstrap=True,
    criterion='entropy',
    max_depth=None,
    max_features=2,
    max_leaf_nodes=None,
    min_samples_leaf=6,
    min_samples_split=2,
    n_estimators=100,
    n_jobs=1,
    oob_score=False,
    random_state=14,
    verbose=0,
)
clf = RandomForestClassifier(**best_params)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
总结
以上是生活随笔为你收集整理的《比赛结果预测_决策树_随机森林(通用 数据挖掘入门与实践-实验5)》的全部内容,希望这篇文章能够帮你解决所遇到的问题。
- 上一篇: Leetcode题库 2038.邻色同删
- 下一篇: 机器学习 KNN算法_0 丐版(matl