kaggle研究生招生(中)
上次已用這些數據訓練了回歸模型。
由於數據中大多數候選人的錄取機會都在70%以上,模型對許多錄取機會較低(不成功)的候選人預測得並不好。
# Histogram of the "Chance of Admit" column: 200 bins on a 6x6-inch figure.
# `df` and `plt` are expected to be defined earlier in the notebook.
df["Chance of Admit"].plot(kind="hist", bins=200, figsize=(6, 6))
plt.title("Chance of Admit")
plt.xlabel("Chance of Admit")
plt.ylabel("Frequency")
plt.show()
為分類準備數據
如果候選人的錄取機會大於80%,則該候選人的標籤為1。
如果候選人的錄取機會小於或等於80%,則該候選人的標籤為0。
邏輯回歸
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Fit logistic regression on the 0/1 labels and report test accuracy.
# x_train / x_test / y_train_01 / y_test_01 are prepared outside this snippet.
lrc = LogisticRegression()
lrc.fit(x_train, y_train_01)
print("score: ", lrc.score(x_test, y_test_01))

# Spot-check two individual test rows against their true labels.
print("real value of y_test_01[1]: " + str(y_test_01[1]) + " -> the predict: " + str(lrc.predict(x_test.iloc[[1], :])))
print("real value of y_test_01[2]: " + str(y_test_01[2]) + " -> the predict: " + str(lrc.predict(x_test.iloc[[2], :])))

# Predict the test set once and reuse the result for every metric below.
y_pred_lrc = lrc.predict(x_test)

# Confusion matrix on the test set.
# The original notes that len(y_test_01[y_test_01 == 1]) is 29.
cm_lrc = confusion_matrix(y_test_01, y_pred_lrc)

# Confusion-matrix heatmap with integer cell annotations.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_lrc, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.title("Test for Test Dataset")
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.show()

print("precision_score: ", precision_score(y_test_01, y_pred_lrc))
print("recall_score: ", recall_score(y_test_01, y_pred_lrc))
print("f1_score: ", f1_score(y_test_01, y_pred_lrc))

score: 0.9
real value of y_test_01[1]: 0 -> the predict: [0]
real value of y_test_01[2]: 1 -> the predict: [1]
precision_score: 0.9565217391304348
recall_score: 0.7586206896551724
f1_score: 0.8461538461538461
Test for Train Dataset:
# Confusion matrix of the logistic-regression model on the training set,
# drawn as a heatmap (relies on lrc, confusion_matrix, plt and sns from above).
cm_lrc_train = confusion_matrix(y_train_01, lrc.predict(x_train))
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_lrc_train, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.title("Test for Train Dataset")
plt.show()

SVC
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.svm import SVC

# Fit a support-vector classifier (fixed random_state) and report test accuracy.
svm = SVC(random_state=1)
svm.fit(x_train, y_train_01)
print("score: ", svm.score(x_test, y_test_01))

# Spot-check two individual test rows against their true labels.
print("real value of y_test_01[1]: " + str(y_test_01[1]) + " -> the predict: " + str(svm.predict(x_test.iloc[[1], :])))
print("real value of y_test_01[2]: " + str(y_test_01[2]) + " -> the predict: " + str(svm.predict(x_test.iloc[[2], :])))

# Predict the test set once and reuse the result for every metric below.
y_pred_svm = svm.predict(x_test)

# Confusion matrix on the test set.
# The original notes that len(y_test_01[y_test_01 == 1]) is 29.
cm_svm = confusion_matrix(y_test_01, y_pred_svm)

# Confusion-matrix heatmap with integer cell annotations.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_svm, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.title("Test for Test Dataset")
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.show()

print("precision_score: ", precision_score(y_test_01, y_pred_svm))
print("recall_score: ", recall_score(y_test_01, y_pred_svm))
print("f1_score: ", f1_score(y_test_01, y_pred_svm))

score: 0.9
real value of y_test_01[1]: 0 -> the predict: [0]
real value of y_test_01[2]: 1 -> the predict: [1]
precision_score: 0.9565217391304348
recall_score: 0.7586206896551724
f1_score: 0.8461538461538461
Test for Train Dataset
# Confusion matrix of the SVC model on the training set, drawn as a heatmap
# (relies on svm, confusion_matrix, plt and sns from the snippet above).
cm_svm_train = confusion_matrix(y_train_01, svm.predict(x_train))
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_svm_train, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.title("Test for Train Dataset")
plt.show()

樸素貝葉斯
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the 0/1 labels and report test accuracy.
nb = GaussianNB()
nb.fit(x_train, y_train_01)
print("score: ", nb.score(x_test, y_test_01))

# Spot-check two individual test rows against their true labels.
print("real value of y_test_01[1]: " + str(y_test_01[1]) + " -> the predict: " + str(nb.predict(x_test.iloc[[1], :])))
print("real value of y_test_01[2]: " + str(y_test_01[2]) + " -> the predict: " + str(nb.predict(x_test.iloc[[2], :])))

# Predict the test set once and reuse the result for every metric below.
y_pred_nb = nb.predict(x_test)

# Confusion matrix on the test set.
# The original notes that len(y_test_01[y_test_01 == 1]) is 29.
cm_nb = confusion_matrix(y_test_01, y_pred_nb)

# Confusion-matrix heatmap with integer cell annotations.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_nb, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.title("Test for Test Dataset")
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.show()

print("precision_score: ", precision_score(y_test_01, y_pred_nb))
print("recall_score: ", recall_score(y_test_01, y_pred_nb))
print("f1_score: ", f1_score(y_test_01, y_pred_nb))

score: 0.9625
real value of y_test_01[1]: 0 -> the predict: [0]
real value of y_test_01[2]: 1 -> the predict: [1]
precision_score: 0.9333333333333333
recall_score: 0.9655172413793104
f1_score: 0.9491525423728815
Test for Train Dataset:
# Confusion matrix of the naive-Bayes model on the training set, drawn as a
# heatmap (relies on nb, confusion_matrix, plt and sns from the snippet above).
cm_nb_train = confusion_matrix(y_train_01, nb.predict(x_train))
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_nb_train, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.title("Test for Train Dataset")
plt.show()

決策樹
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the 0/1 labels and report test accuracy.
# No random_state is passed here, unlike the SVC and random-forest snippets.
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train_01)
print("score: ", dtc.score(x_test, y_test_01))

# Spot-check two individual test rows against their true labels.
print("real value of y_test_01[1]: " + str(y_test_01[1]) + " -> the predict: " + str(dtc.predict(x_test.iloc[[1], :])))
print("real value of y_test_01[2]: " + str(y_test_01[2]) + " -> the predict: " + str(dtc.predict(x_test.iloc[[2], :])))

# Predict the test set once and reuse the result for every metric below.
y_pred_dtc = dtc.predict(x_test)

# Confusion matrix on the test set.
# The original notes that len(y_test_01[y_test_01 == 1]) is 29.
cm_dtc = confusion_matrix(y_test_01, y_pred_dtc)

# Confusion-matrix heatmap with integer cell annotations.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_dtc, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.title("Test for Test Dataset")
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.show()

print("precision_score: ", precision_score(y_test_01, y_pred_dtc))
print("recall_score: ", recall_score(y_test_01, y_pred_dtc))
print("f1_score: ", f1_score(y_test_01, y_pred_dtc))

score: 0.9375
real value of y_test_01[1]: 0 -> the predict: [0]
real value of y_test_01[2]: 1 -> the predict: [1]
precision_score: 0.9615384615384616
recall_score: 0.8620689655172413
f1_score: 0.9090909090909091
Test for Train Dataset
# Confusion matrix of the decision-tree model on the training set, drawn as a
# heatmap (relies on dtc, confusion_matrix, plt and sns from the snippet above).
cm_dtc_train = confusion_matrix(y_train_01, dtc.predict(x_train))
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_dtc_train, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.title("Test for Train Dataset")
plt.show()

隨機森林
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Fit a 100-tree random forest (fixed random_state) and report test accuracy.
rfc = RandomForestClassifier(n_estimators=100, random_state=1)
rfc.fit(x_train, y_train_01)
print("score: ", rfc.score(x_test, y_test_01))

# Spot-check two individual test rows against their true labels.
print("real value of y_test_01[1]: " + str(y_test_01[1]) + " -> the predict: " + str(rfc.predict(x_test.iloc[[1], :])))
print("real value of y_test_01[2]: " + str(y_test_01[2]) + " -> the predict: " + str(rfc.predict(x_test.iloc[[2], :])))

# Predict the test set once and reuse the result for every metric below.
y_pred_rfc = rfc.predict(x_test)

# Confusion matrix on the test set.
# The original notes that len(y_test_01[y_test_01 == 1]) is 29.
cm_rfc = confusion_matrix(y_test_01, y_pred_rfc)

# Confusion-matrix heatmap with integer cell annotations.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_rfc, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.title("Test for Test Dataset")
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.show()

print("precision_score: ", precision_score(y_test_01, y_pred_rfc))
print("recall_score: ", recall_score(y_test_01, y_pred_rfc))
print("f1_score: ", f1_score(y_test_01, y_pred_rfc))

score: 0.9375
real value of y_test_01[1]: 0 -> the predict: [0]
real value of y_test_01[2]: 1 -> the predict: [1]
precision_score: 0.9615384615384616
recall_score: 0.8620689655172413
f1_score: 0.9090909090909091
Test for Train Dataset
# Confusion matrix of the random-forest model on the training set, drawn as a
# heatmap (relies on rfc, confusion_matrix, plt and sns from the snippet above).
cm_rfc_train = confusion_matrix(y_train_01, rfc.predict(x_train))
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_rfc_train, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.title("Test for Train Dataset")
plt.show()

kNN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

# Finding the k value: fit one classifier per k in 1..49 and record its
# accuracy on the test set.
# NOTE(review): k is scored on the same test set used for the final metrics,
# so the reported score is optimistic; a validation split or cross-validation
# on the training data would avoid this.
scores = []
for each in range(1, 50):
    knn_n = KNeighborsClassifier(n_neighbors=each)
    knn_n.fit(x_train, y_train_01)
    scores.append(knn_n.score(x_test, y_test_01))

plt.plot(range(1, 50), scores)
plt.xlabel("k")
plt.ylabel("accuracy")
plt.show()

# Final model with k = 3 (n_neighbors is k).
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train_01)
print("score of 3 :", knn.score(x_test, y_test_01))

# Spot-check two individual test rows against their true labels.
print("real value of y_test_01[1]: " + str(y_test_01[1]) + " -> the predict: " + str(knn.predict(x_test.iloc[[1], :])))
print("real value of y_test_01[2]: " + str(y_test_01[2]) + " -> the predict: " + str(knn.predict(x_test.iloc[[2], :])))

# Predict the test set once and reuse the result for every metric below.
y_pred_knn = knn.predict(x_test)

# Confusion matrix on the test set.
# The original notes that len(y_test_01[y_test_01 == 1]) is 29.
cm_knn = confusion_matrix(y_test_01, y_pred_knn)

# Confusion-matrix heatmap with integer cell annotations.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_knn, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.title("Test for Test Dataset")
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.show()

print("precision_score: ", precision_score(y_test_01, y_pred_knn))
print("recall_score: ", recall_score(y_test_01, y_pred_knn))
print("f1_score: ", f1_score(y_test_01, y_pred_knn))
score of 3 : 0.9375
real value of y_test_01[1]: 0 -> the predict: [0]
real value of y_test_01[2]: 1 -> the predict: [1]
precision_score: 0.9285714285714286
recall_score: 0.896551724137931
f1_score: 0.912280701754386
Test for Train Dataset:
# Confusion matrix of the k = 3 kNN model on the training set, drawn as a
# heatmap (relies on knn, confusion_matrix, plt and sns from the snippet above).
cm_knn_train = confusion_matrix(y_train_01, knn.predict(x_train))
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_knn_train, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("predicted y values")
plt.ylabel("real y values")
plt.title("Test for Train Dataset")
plt.show()
所有分類算法在測試集上的準確率都達到了90%或更高。表現最好的是高斯樸素貝葉斯,準確率約為96%(0.9625)。
上文是回歸算法,此文分類
總結
以上是生活随笔為你收集整理的kaggle研究生招生(中)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 不办理转汇业务是什么意思
- 下一篇: 日利率0.045年利率是多少