模型验证
對分類模型的檢驗
加載數據
1 %matplotlib notebook 2 import numpy as np 3 import pandas as pd 4 import seaborn as sns 5 import matplotlib.pyplot as plt 6 from sklearn.model_selection import train_test_split 7 from sklearn.datasets import load_digits 8 9 dataset = load_digits() 10 X, y = dataset.data, dataset.target 11 #統計每個種類的個數 12 for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)): 13 print(class_name,class_count) 0 178 1 182 2 177 3 183 4 181 5 182 6 181 7 179 8 174 9 180 1 # 進行一個數據之間的轉換 2 # Negative class (0) is 'not digit 1' 3 # Positive class (1) is 'digit 1' 4 y_binary_imbalanced = y.copy() 5 y_binary_imbalanced[y_binary_imbalanced != 1] = 0 6 7 print('Original labels:\t', y[1:30]) 8 print('New binary labels:\t', y_binary_imbalanced[1:30]) Original labels: [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9] New binary labels: [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]1 #np.bincount:用于統計每個索引的總個數 2 np.bincount(y_binary_imbalanced) # Negative class (0) is the most frequent class array([1615, 182])
(標籤為0的樣本有1615個,標籤為1的樣本有182個;兩類比例約為9:1,嚴重不平衡,即imbalanced classes)
使用RBF核函數SVM來建立分類模型 1 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 2 3 # Accuracy of Support Vector Machine classifier 4 from sklearn.svm import SVC 5 6 svm = SVC(kernel='rbf', C=1).fit(X_train, y_train) 7 svm.score(X_test, y_test) 0.90888888888888886
DummyClassifier是一個使用簡單規則進行預測的分類器,它可以用作與實際分類器進行比較
的基準,尤其是對于不平衡的類。不能用于實際問題。 1 from sklearn.dummy import DummyClassifier 2 3 # Negative class (0) is most frequent 4 #使用策略(strategy)大頻率來進行擬合 5 dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train) 6 # Therefore the dummy 'most_frequent' classifier always predicts class 0 7 y_dummy_predictions = dummy_majority.predict(X_test) 8 9 y_dummy_predictions array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 1 dummy_majority.score(X_test, y_test) 0.9044444444444445
1 svm = SVC(kernel='linear', C=1).fit(X_train, y_train) 2 svm.score(X_test, y_test) 0.97777777777777775
混淆矩陣 1 from sklearn.metrics import confusion_matrix 2 3 # Negative class (0) is most frequent 4 dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train) 5 y_majority_predicted = dummy_majority.predict(X_test) 6 #產生混淆矩陣 7 confusion = confusion_matrix(y_test, y_majority_predicted) 8 9 print('Most frequent class (dummy classifier)\n', confusion) 1 from sklearn.metrics import confusion_matrix 2 3 # Negative class (0) is most frequent 4 dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train) 5 y_majority_predicted = dummy_majority.predict(X_test) 6 #產生混淆矩陣 7 confusion = confusion_matrix(y_test, y_majority_predicted) 8 9 print('Most frequent class (dummy classifier)\n', confusion) Most frequent class (dummy classifier)[[407 0][ 43 0]]
1 # produces random predictions w/ same class proportion as training set 2 dummy_classprop = DummyClassifier(strategy='stratified').fit(X_train, y_train) 3 y_classprop_predicted = dummy_classprop.predict(X_test) 4 confusion = confusion_matrix(y_test, y_classprop_predicted) 5 6 print('Random class-proportional prediction (dummy classifier)\n', confusion) Random class-proportional prediction (dummy classifier)[[361 46][ 39 4]]
1 svm = SVC(kernel='linear', C=1).fit(X_train, y_train) 2 svm_predicted = svm.predict(X_test) 3 confusion = confusion_matrix(y_test, svm_predicted) 4 5 print('Support vector machine classifier (linear kernel, C=1)\n', confusion) Support vector machine classifier (linear kernel, C=1)[[402 5][ 5 38]]
1 from sklearn.linear_model import LogisticRegression 2 3 lr = LogisticRegression().fit(X_train, y_train) 4 lr_predicted = lr.predict(X_test) 5 confusion = confusion_matrix(y_test, lr_predicted) 6 7 print('Logistic regression classifier (default settings)\n', confusion) Logistic regression classifier (default settings)[[401 6][ 6 37]]
1 from sklearn.tree import DecisionTreeClassifier 2 3 dt = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train) 4 tree_predicted = dt.predict(X_test) 5 confusion = confusion_matrix(y_test, tree_predicted) 6 7 print('Decision tree classifier (max_depth = 2)\n', confusion) Decision tree classifier (max_depth = 2)[[400 7][ 17 26]] 二元分類的評估 1 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 2 # Accuracy = TP + TN / (TP + TN + FP + FN) 3 # Precision = TP / (TP + FP) 4 # Recall = TP / (TP + FN) Also known as sensitivity, or True Positive Rate 5 # F1 = 2 * Precision * Recall / (Precision + Recall) 6 print('Accuracy: {:.2f}'.format(accuracy_score(y_test, tree_predicted))) 7 print('Precision: {:.2f}'.format(precision_score(y_test, tree_predicted))) 8 print('Recall: {:.2f}'.format(recall_score(y_test, tree_predicted))) 9 print('F1: {:.2f}'.format(f1_score(y_test, tree_predicted))) Accuracy: 0.95 Precision: 0.79 Recall: 0.60 F1: 0.68
綜合報告 1 # Combined report with all above metrics 2 from sklearn.metrics import classification_report 3 4 print(classification_report(y_test, tree_predicted, target_names=['not 1', '1'])) precision recall f1-score supportnot 1 0.96 0.98 0.97 4071 0.79 0.60 0.68 43avg / total 0.94 0.95 0.94 450 1 print('Random class-proportional (dummy)\n', 2 classification_report(y_test, y_classprop_predicted, target_names=['not 1', '1'])) 3 print('SVM\n', 4 classification_report(y_test, svm_predicted, target_names = ['not 1', '1'])) 5 print('Logistic regression\n', 6 classification_report(y_test, lr_predicted, target_names = ['not 1', '1'])) 7 print('Decision tree\n', 8 classification_report(y_test, tree_predicted, target_names = ['not 1', '1'])) Random class-proportional (dummy)precision recall f1-score supportnot 1 0.90 0.89 0.89 4071 0.08 0.09 0.09 43avg / total 0.82 0.81 0.82 450SVMprecision recall f1-score supportnot 1 0.99 0.99 0.99 4071 0.88 0.88 0.88 43avg / total 0.98 0.98 0.98 450Logistic regressionprecision recall f1-score supportnot 1 0.99 0.99 0.99 4071 0.86 0.86 0.86 43avg / total 0.97 0.97 0.97 450Decision treeprecision recall f1-score supportnot 1 0.96 0.98 0.97 4071 0.79 0.60 0.68 43avg / total 0.94 0.95 0.94 450
Decision functions(決策函數:對每個樣本輸出一個分數,分數的正負決定預測的類別,絕對值越大表示分類器越有把握;它不是訓練時最小化的cost function)
1 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 2 y_scores_lr = lr.fit(X_train, y_train).decision_function(X_test) 3 y_score_list = list(zip(y_test[0:20], y_scores_lr[0:20])) 4 5 # show the decision_function scores for first 20 instances 6 y_score_list [(0, -23.172292973469549),(0, -13.542576515500066),(0, -21.717588760007864),(0, -18.903065133316442),(0, -19.733169947138638),(0, -9.7463217496747667),(1, 5.2327155658831117),(0, -19.308012306288916),(0, -25.099330209728528),(0, -21.824312362996),(0, -24.143782750720494),(0, -19.578811099762504),(0, -22.568371393280199),(0, -10.822590225240777),(0, -11.907918741521936),(0, -10.977026853802803),(1, 11.206811164226373),(0, -27.644157619807473),(0, -12.857692102545419),(0, -25.848149140240199)]
predict_proba():輸出每個樣本屬於各類別的機率;下面取第二欄(索引1),即樣本被預測為正類(數字1)的機率
1 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 2 y_proba_lr = lr.fit(X_train, y_train).predict_proba(X_test) 3 y_proba_list = list(zip(y_test[0:20], y_proba_lr[0:20,1])) 4 5 # show the probability of positive class for first 20 instances 6 y_proba_list [(0, 8.6377579220606466e-11),(0, 1.3138118599563736e-06),(0, 3.6997386039099659e-10),(0, 6.1730972504865241e-09),(0, 2.6914925394345074e-09),(0, 5.8506057771143608e-05),(1, 0.99468934644404694),(0, 4.1175302368500096e-09),(0, 1.2574750894253029e-11),(0, 3.3252290754668869e-10),(0, 3.269552979937297e-11),(0, 3.1407283576084996e-09),(0, 1.5800864117150149e-10),(0, 1.9943442430612578e-05),(0, 6.7368003023859777e-06),(0, 1.7089540581641637e-05),(1, 0.9999864188091131),(0, 9.8694940340196163e-13),(0, 2.6059983600823614e-06),(0, 5.9469113009063784e-12)]
Precision-recall curves
1 from sklearn.metrics import precision_recall_curve 2 3 precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr) 4 closest_zero = np.argmin(np.abs(thresholds)) 5 closest_zero_p = precision[closest_zero] 6 closest_zero_r = recall[closest_zero] 7 8 plt.figure() 9 plt.xlim([0.0, 1.01]) 10 plt.ylim([0.0, 1.01]) 11 plt.plot(precision, recall, label='Precision-Recall Curve') 12 plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3) 13 plt.xlabel('Precision', fontsize=16) 14 plt.ylabel('Recall', fontsize=16) 15 plt.axes().set_aspect('equal') 16 plt.show()ROC curves, Area-Under-Curve (AUC)
1 from sklearn.metrics import roc_curve, auc 2 3 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 4 5 y_score_lr = lr.fit(X_train, y_train).decision_function(X_test) 6 fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr) 7 roc_auc_lr = auc(fpr_lr, tpr_lr) 8 9 plt.figure() 10 plt.xlim([-0.01, 1.00]) 11 plt.ylim([-0.01, 1.01]) 12 plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr)) 13 plt.xlabel('False Positive Rate', fontsize=16) 14 plt.ylabel('True Positive Rate', fontsize=16) 15 plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16) 16 plt.legend(loc='lower right', fontsize=13) 17 plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--') 18 plt.axes().set_aspect('equal') 19 plt.show() 1 from matplotlib import cm 2 3 X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0) 4 5 plt.figure() 6 plt.xlim([-0.01, 1.00]) 7 plt.ylim([-0.01, 1.01]) 8 for g in [0.01, 0.1, 0.20, 1]: 9 svm = SVC(gamma=g).fit(X_train, y_train) 10 y_score_svm = svm.decision_function(X_test) 11 fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm) 12 roc_auc_svm = auc(fpr_svm, tpr_svm) 13 accuracy_svm = svm.score(X_test, y_test) 14 print("gamma = {:.2f} accuracy = {:.2f} AUC = {:.2f}".format(g, accuracy_svm, 15 roc_auc_svm)) 16 plt.plot(fpr_svm, tpr_svm, lw=3, alpha=0.7, 17 label='SVM (gamma = {:0.2f}, area = {:0.2f})'.format(g, roc_auc_svm)) 18 19 plt.xlabel('False Positive Rate', fontsize=16) 20 plt.ylabel('True Positive Rate (Recall)', fontsize=16) 21 plt.plot([0, 1], [0, 1], color='k', lw=0.5, linestyle='--') 22 plt.legend(loc="lower right", fontsize=11) 23 plt.title('ROC curve: (1-of-10 digits classifier)', fontsize=16) 24 plt.axes().set_aspect('equal') 25 26 plt.show() gamma = 0.01 accuracy = 0.91 AUC = 1.00 gamma = 0.10 accuracy = 0.90 AUC = 0.98 gamma = 0.20 accuracy = 0.90 AUC = 0.66 gamma = 1.00 accuracy = 0.90 AUC = 0.50 對多分類模型的驗證方法 多分類模型的混淆矩陣 1 dataset = 
load_digits() 2 X, y = dataset.data, dataset.target 3 X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X, y, random_state=0) 4 5 6 svm = SVC(kernel = 'linear').fit(X_train_mc, y_train_mc) 7 svm_predicted_mc = svm.predict(X_test_mc) 8 confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc) 9 df_cm = pd.DataFrame(confusion_mc, 10 index = [i for i in range(0,10)], columns = [i for i in range(0,10)]) 11 12 plt.figure(figsize=(5.5,4)) 13 sns.heatmap(df_cm, annot=True) 14 plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, 15 svm_predicted_mc))) 16 plt.ylabel('True label') 17 plt.xlabel('Predicted label') 18 19 20 svm = SVC(kernel = 'rbf').fit(X_train_mc, y_train_mc) 21 svm_predicted_mc = svm.predict(X_test_mc) 22 confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc) 23 df_cm = pd.DataFrame(confusion_mc, index = [i for i in range(0,10)], 24 columns = [i for i in range(0,10)]) 25 26 plt.figure(figsize = (5.5,4)) 27 sns.heatmap(df_cm, annot=True) 28 plt.title('SVM RBF Kernel \nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, 29 svm_predicted_mc))) 30 plt.ylabel('True label') 31 plt.xlabel('Predicted label');多分類模型的報告
1 print(classification_report(y_test_mc, svm_predicted_mc))
             precision    recall  f1-score   support
          0       1.00      0.65      0.79        37
          1       1.00      0.23      0.38        43
          2       1.00      0.39      0.56        44
          3       1.00      0.93      0.97        45
          4       0.14      1.00      0.25        38
          5       1.00      0.33      0.50        48
          6       1.00      0.54      0.70        52
          7       1.00      0.35      0.52        48
          8       1.00      0.02      0.04        48
          9       1.00      0.55      0.71        47
avg / total       0.93      0.49      0.54       450
微觀平均指標與宏觀平均指標
1 print('Micro-averaged precision = {:.2f} (treat instances equally)' 2 .format(precision_score(y_test_mc, svm_predicted_mc, average = 'micro'))) 3 print('Macro-averaged precision = {:.2f} (treat classes equally)' 4 .format(precision_score(y_test_mc, svm_predicted_mc, average = 'macro'))) Micro-averaged precision = 0.49 (treat instances equally) Macro-averaged precision = 0.91 (treat classes equally) 1 print('Micro-averaged f1 = {:.2f} (treat instances equally)' 2 .format(f1_score(y_test_mc, svm_predicted_mc, average = 'micro'))) 3 print('Macro-averaged f1 = {:.2f} (treat classes equally)' 4 .format(f1_score(y_test_mc, svm_predicted_mc, average = 'macro'))) Micro-averaged f1 = 0.49 (treat instances equally) Macro-averaged f1 = 0.54 (treat classes equally)?回歸模型評估指標
1 %matplotlib notebook 2 import matplotlib.pyplot as plt 3 import numpy as np 4 from sklearn.model_selection import train_test_split 5 from sklearn import datasets 6 from sklearn.linear_model import LinearRegression 7 from sklearn.metrics import mean_squared_error, r2_score 8 from sklearn.dummy import DummyRegressor 9 10 diabetes = datasets.load_diabetes() 11 12 X = diabetes.data[:, None, 6] 13 y = diabetes.target 14 15 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 16 17 lm = LinearRegression().fit(X_train, y_train) 18 lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train) 19 20 y_predict = lm.predict(X_test) 21 y_predict_dummy_mean = lm_dummy_mean.predict(X_test) 22 23 print('Linear model, coefficients: ', lm.coef_) 24 print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, 25 y_predict_dummy_mean))) 26 print("Mean squared error (linear model): {:.2f}".format(mean_squared_error(y_test, y_predict))) 27 print("r2_score (dummy): {:.2f}".format(r2_score(y_test, y_predict_dummy_mean))) 28 print("r2_score (linear model): {:.2f}".format(r2_score(y_test, y_predict))) 29 30 # Plot outputs 31 plt.scatter(X_test, y_test, color='black') 32 plt.plot(X_test, y_predict, color='green', linewidth=2) 33 plt.plot(X_test, y_predict_dummy_mean, color='red', linestyle = 'dashed', 34 linewidth=2, label = 'dummy') 35 36 plt.show() Linear model, coefficients: [-698.80206267] Mean squared error (dummy): 4965.13 Mean squared error (linear model): 4646.74 r2_score (dummy): -0.00 r2_score (linear model): 0.06使用評估指標進行模型選擇
交叉驗證例子
1 from sklearn.model_selection import cross_val_score 2 from sklearn.svm import SVC 3 4 dataset = load_digits() 5 # again, making this a binary problem with 'digit 1' as positive class 6 # and 'not 1' as negative class 7 X, y = dataset.data, dataset.target == 1 8 clf = SVC(kernel='linear', C=1) 9 10 # accuracy is the default scoring metric 11 print('Cross-validation (accuracy)', cross_val_score(clf, X, y, cv=5)) 12 # use AUC as scoring metric 13 print('Cross-validation (AUC)', cross_val_score(clf, X, y, cv=5, scoring = 'roc_auc')) 14 # use recall as scoring metric 15 print('Cross-validation (recall)', cross_val_score(clf, X, y, cv=5, scoring = 'recall')) Cross-validation (accuracy) [ 0.91944444 0.98611111 0.97214485 0.97493036 0.96935933] Cross-validation (AUC) [ 0.9641871 0.9976571 0.99372205 0.99699002 0.98675611] Cross-validation (recall) [ 0.81081081 0.89189189 0.83333333 0.83333333 0.83333333]網格搜索示例
1 from sklearn.svm import SVC 2 from sklearn.model_selection import GridSearchCV 3 from sklearn.metrics import roc_auc_score 4 5 dataset = load_digits() 6 X, y = dataset.data, dataset.target == 1 7 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 8 9 clf = SVC(kernel='rbf') 10 grid_values = {'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100]} 11 12 # default metric to optimize over grid parameters: accuracy 13 grid_clf_acc = GridSearchCV(clf, param_grid = grid_values) 14 grid_clf_acc.fit(X_train, y_train) 15 y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test) 16 17 print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_) 18 print('Grid best score (accuracy): ', grid_clf_acc.best_score_) 19 20 # alternative metric to optimize over grid parameters: AUC 21 grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc') 22 grid_clf_auc.fit(X_train, y_train) 23 y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test) 24 25 print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc)) 26 print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_) 27 print('Grid best score (AUC): ', grid_clf_auc.best_score_) Grid best parameter (max. accuracy): {'gamma': 0.001} Grid best score (accuracy): 0.996288047513 Test set AUC: 0.999828581224 Grid best parameter (max. AUC): {'gamma': 0.001} Grid best score (AUC): 0.99987412783 1 #Evaluation metrics supported for model selection 2 from sklearn.metrics.scorer import SCORERS 3 4 print(sorted(list(SCORERS.keys()))) ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro','f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error',
'median_absolute_error', 'neg_log_loss', 'neg_mean_absolute_error',
'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro',
'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro',
'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
使用數字數據集的雙特征分類示例
使用不同的評估指標優化分類器
1 from sklearn.datasets import load_digits 2 from sklearn.model_selection import train_test_split 3 from adspy_shared_utilities import plot_class_regions_for_classifier_subplot 4 from sklearn.svm import SVC 5 from sklearn.model_selection import GridSearchCV 6 7 8 dataset = load_digits() 9 X, y = dataset.data, dataset.target == 1 10 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 11 12 # Create a two-feature input vector matching the example plot above 13 # We jitter the points (add a small amount of random noise) in case there are areas 14 # in feature space where many instances have the same features. 15 jitter_delta = 0.25 16 X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta 17 X_twovar_test = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta 18 19 clf = SVC(kernel = 'linear').fit(X_twovar_train, y_train) 20 grid_values = {'class_weight':['balanced', {1:2},{1:3},{1:4},{1:5},{1:10},{1:20},{1:50}]} 21 plt.figure(figsize=(9,6)) 22 for i, eval_metric in enumerate(('precision','recall', 'f1','roc_auc')): 23 grid_clf_custom = GridSearchCV(clf, param_grid=grid_values, scoring=eval_metric) 24 grid_clf_custom.fit(X_twovar_train, y_train) 25 print('Grid best parameter (max. {0}): {1}' 26 .format(eval_metric, grid_clf_custom.best_params_)) 27 print('Grid best score ({0}): {1}' 28 .format(eval_metric, grid_clf_custom.best_score_)) 29 plt.subplots_adjust(wspace=0.3, hspace=0.3) 30 plot_class_regions_for_classifier_subplot(grid_clf_custom, X_twovar_test, y_test, None, 31 None, None, plt.subplot(2, 2, i+1)) 32 33 plt.title(eval_metric+'-oriented SVC') 34 plt.tight_layout() 35 plt.show() Grid best parameter (max. precision): {'class_weight': {1: 2}} Grid best score (precision): 0.5379994354058584 Grid best parameter (max. recall): {'class_weight': {1: 50}} Grid best score (recall): 0.921184706893106 Grid best parameter (max. 
f1): {'class_weight': {1: 3}} Grid best score (f1): 0.5079935126308859 Grid best parameter (max. roc_auc): {'class_weight': {1: 20}} Grid best score (roc_auc): 0.8889416320163174
默認SVC分類器的精確召回曲線(平衡類別權重)
1 from sklearn.model_selection import train_test_split 2 from sklearn.metrics import precision_recall_curve 3 from adspy_shared_utilities import plot_class_regions_for_classifier 4 from sklearn.svm import SVC 5 6 dataset = load_digits() 7 X, y = dataset.data, dataset.target == 1 8 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 9 10 # create a two-feature input vector matching the example plot above 11 jitter_delta = 0.25 12 X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta 13 X_twovar_test = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta 14 15 clf = SVC(kernel='linear', class_weight='balanced').fit(X_twovar_train, y_train) 16 17 y_scores = clf.decision_function(X_twovar_test) 18 19 precision, recall, thresholds = precision_recall_curve(y_test, y_scores) 20 closest_zero = np.argmin(np.abs(thresholds)) 21 closest_zero_p = precision[closest_zero] 22 closest_zero_r = recall[closest_zero] 23 24 plot_class_regions_for_classifier(clf, X_twovar_test, y_test) 25 plt.title("SVC, class_weight = 'balanced', optimized for accuracy") 26 plt.show() 27 28 plt.figure() 29 plt.xlim([0.0, 1.01]) 30 plt.ylim([0.0, 1.01]) 31 plt.title ("Precision-recall curve: SVC, class_weight = 'balanced'") 32 plt.plot(precision, recall, label = 'Precision-Recall Curve') 33 plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3) 34 plt.xlabel('Precision', fontsize=16) 35 plt.ylabel('Recall', fontsize=16) 36 plt.axes().set_aspect('equal') 37 plt.show() 38 print('At zero threshold, precision: {:.2f}, recall: {:.2f}' 39 .format(closest_zero_p, closest_zero_r)) At zero threshold, precision: 0.22, recall: 0.74?
轉載于:https://www.cnblogs.com/zhengzhe/p/8547810.html
總結
- 上一篇: js特效
- 下一篇: range 和 xrange