【新手福音】分享一个自己制作的机器学习工具集合
生活随笔
收集整理的這篇文章主要介紹了
【新手福音】分享一个自己制作的机器学习工具集合
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
機器學習小工具集合
我!李英俊!覺得這篇文章超級有用!值得你一看!
功能: 機器學習工具集合,直接導入一個類,傳參訓練集,驗證集就能生成報告
使用方式:
tips: 下面會附上代碼和兩個使用demo,也會貼上github上的鏈接。如果大家需要什麼新的功能,可以留言告訴我;最新的更新應該會在github上同步,希望大家星星我,有空的話再更新博客。求個贊不過分吧!轉載請一定註明出處喲~
點我打開github地址,求關注
結果展示:
-
輸出example
-
輸出報告example
-
優化報告example
-
代碼具體實現:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@author: Li Tian
@contact: 694317828@qq.com
@software: pycharm
@file: ML_combines.py
@time: 2019/9/23 8:53
@desc: Machine-learning toolkit: hand one class a training set and a
       validation set and it produces evaluation reports.
"""
from collections import OrderedDict
from itertools import product


class MLTools:
    """Train a fixed collection of classifiers on one train/test split.

    Covers: multinomial naive Bayes, Gaussian naive Bayes, k-nearest
    neighbours, logistic regression, support vector machine, decision tree,
    random forest, AdaBoost, GBDT and xgboost.

    Every ``*_classifier`` method returns a ``(fitted_model, report)`` tuple.
    ``report`` is ``None`` unless hyper-parameters were tuned, in which case
    it is the text describing the selected parameters and the score change.
    """

    random_state = 42

    # Coarse tuning grid shared by the tree-based ensembles.
    # Reference 1: https://blog.csdn.net/geduo_feng/article/details/79558572
    # Reference 2: https://blog.csdn.net/qq_35040963/article/details/88832030
    # Built from ordered pairs (not from a dict literal) so that the key order
    # used in the tuning report is guaranteed on every interpreter.
    parameter_tree = OrderedDict([
        # fewer estimators -> simpler model
        ('n_estimators', range(10, 200, 20)),
        # smaller maximum depth -> simpler model
        ('max_depth', range(1, 10, 1)),
        # larger minimum split size -> simpler model, hence descending order
        ('min_samples_split', list(range(2, 10, 1))[::-1]),
    ])

    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the split; nothing is trained until a classifier method is called."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------
    def _fit_default(self, model):
        """Fit ``model`` on the training split and return ``(model, None)``."""
        model.fit(self.X_train, self.y_train)
        return model, None

    @staticmethod
    def _param_grid(names):
        """Iterate over every value combination of the grid entries ``names``.

        The first name varies slowest, matching nested ``for`` loops written
        in the same order.
        """
        return product(*(MLTools.parameter_tree[name] for name in names))

    @staticmethod
    def _format_tuning_report(names, values, base_score, best_score):
        """Build the text block written to ``optimized.txt`` for one model."""
        lines = ['-- optimized parameters: ']
        lines.extend('%s = %s' % (name, value) for name, value in zip(names, values))
        lines.append('f1_macro: %.4f-->%.4f' % (base_score, best_score))
        return '\n'.join(lines) + '\n'

    def _tune(self, baseline, build_model, names=None):
        """Grid-search ``build_model`` and return ``(best_model, report)``.

        :param baseline: already fitted default model; only used to report
            the score before tuning
        :param build_model: callable accepting the grid entries as keyword
            arguments and returning an unfitted estimator
        :param names: grid entries to search, defaults to every key of
            ``parameter_tree``
        :return: the best fitted candidate and its tuning report. The tuned
            model is returned even when the baseline scored higher.

        NOTE: candidates are ranked by macro F1 on the *test* split, so the
        test-set figures reported afterwards are optimistic.
        """
        from sklearn.metrics import f1_score

        if names is None:
            names = list(MLTools.parameter_tree)

        def macro_f1(model):
            return f1_score(self.y_test, model.predict(self.X_test), average='macro')

        base_score = macro_f1(baseline)
        # Start below any valid F1 so the first candidate is always accepted,
        # even when every candidate scores exactly 0.
        best_score, best_values, best_model = -1.0, None, None
        for values in MLTools._param_grid(names):
            candidate = build_model(**dict(zip(names, values)))
            candidate.fit(self.X_train, self.y_train)
            score = macro_f1(candidate)
            # Strict comparison: on ties the earliest combination wins.
            if score > best_score:
                # Keep the fitted candidate instead of refitting it later.
                best_score, best_values, best_model = score, values, candidate

        report = MLTools._format_tuning_report(names, best_values, base_score, best_score)
        return best_model, report

    # ------------------------------------------------------------------
    # classifiers without tuning
    # ------------------------------------------------------------------
    # Multinomial Naive Bayes Classifier
    def multinomial_naive_bayes_classifier(self):
        from sklearn.naive_bayes import MultinomialNB
        return self._fit_default(MultinomialNB(alpha=0.01))

    # Gaussian Naive Bayes Classifier
    def gaussian_naive_bayes_classifier(self):
        from sklearn.naive_bayes import GaussianNB
        return self._fit_default(GaussianNB())

    # KNN Classifier
    def knn_classifier(self):
        from sklearn.neighbors import KNeighborsClassifier
        return self._fit_default(KNeighborsClassifier())

    # Logistic Regression Classifier
    def logistic_regression_classifier(self):
        from sklearn.linear_model import LogisticRegression
        return self._fit_default(LogisticRegression(penalty='l2'))

    # SVM Classifier
    def svm_classifier(self):
        from sklearn.svm import SVC
        # probability=True is kept from the original configuration.
        return self._fit_default(SVC(kernel='rbf', probability=True))

    # Decision Tree Classifier
    def decision_tree_classifier(self):
        from sklearn.tree import DecisionTreeClassifier
        return self._fit_default(DecisionTreeClassifier())

    # ------------------------------------------------------------------
    # tree ensembles, optionally tuned over ``parameter_tree``
    # ------------------------------------------------------------------
    # Random Forest Classifier
    def random_forest_classifier(self, is_adjust=True):
        """Fit a random forest; with ``is_adjust`` grid-search it as well."""
        from sklearn.ensemble import RandomForestClassifier

        baseline, _ = self._fit_default(RandomForestClassifier())
        if not is_adjust:
            return baseline, None

        def build(n_estimators, max_depth, min_samples_split):
            return RandomForestClassifier(random_state=MLTools.random_state, n_estimators=n_estimators,
                                          max_depth=max_depth, min_samples_split=min_samples_split,
                                          n_jobs=-1)

        return self._tune(baseline, build)

    # AdaBoost Classifier
    def adaboost_classifier(self, is_adjust=True):
        """Fit AdaBoost; with ``is_adjust`` grid-search it over decision-tree bases."""
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.tree import DecisionTreeClassifier

        baseline, _ = self._fit_default(AdaBoostClassifier())
        if not is_adjust:
            return baseline, None

        def build(n_estimators, max_depth, min_samples_split):
            # Depth and split size belong to the base tree, the estimator
            # count to the booster. The base tree is passed positionally,
            # exactly as in the original call.
            base_tree = DecisionTreeClassifier(random_state=MLTools.random_state,
                                               max_depth=max_depth,
                                               min_samples_split=min_samples_split)
            return AdaBoostClassifier(base_tree, random_state=MLTools.random_state,
                                      n_estimators=n_estimators)

        return self._tune(baseline, build)

    # GBDT (Gradient Boosting Decision Tree) Classifier
    def gradient_boosting_classifier(self, is_adjust=True):
        """Fit gradient boosting; with ``is_adjust`` grid-search it as well."""
        from sklearn.ensemble import GradientBoostingClassifier

        baseline, _ = self._fit_default(GradientBoostingClassifier())
        if not is_adjust:
            return baseline, None

        def build(n_estimators, max_depth, min_samples_split):
            return GradientBoostingClassifier(random_state=MLTools.random_state, n_estimators=n_estimators,
                                              max_depth=max_depth, min_samples_split=min_samples_split)

        return self._tune(baseline, build)

    # xgboost (extreme gradient boosting)
    def xgboost_classifier(self, is_adjust=True):
        """Fit xgboost; with ``is_adjust`` grid-search it as well.

        ``min_samples_split`` is not an XGBClassifier parameter, so only
        ``n_estimators`` and ``max_depth`` are searched and reported.
        """
        from xgboost import XGBClassifier

        baseline, _ = self._fit_default(XGBClassifier())
        if not is_adjust:
            return baseline, None

        def build(n_estimators, max_depth):
            return XGBClassifier(random_state=MLTools.random_state, n_estimators=n_estimators,
                                 max_depth=max_depth, n_jobs=-1)

        return self._tune(baseline, build, names=['n_estimators', 'max_depth'])


def _write_report_section(handle, title, train_report, test_report):
    """Append one model's training-set and test-set reports to ``handle``."""
    handle.write('- ' + title + '\n')
    handle.write('-- 【訓練集】' + '\n')
    handle.write(train_report)
    handle.write('\n')
    handle.write('-- 【測試集】' + '\n')
    handle.write(test_report)
    handle.write('\n')


def model_building(X_train, y_train, X_test, y_test, save_path, target_names=None, just_emsemble=False):
    """Train the models, write the reports, then refit on all data and save.

    Files written into ``save_path``: ``report.txt`` (classification
    reports), ``optimized.txt`` (tuning results) and one ``<name>.plk`` per
    model, plus one for the voting ensemble.

    :param save_path: directory for the reports and models; created when missing
    :param target_names: class label names used in the reports
    :param just_emsemble: the individual models already exist in
        ``save_path``; skip training them and only build the ensemble
    """
    from sklearn.metrics import classification_report
    import joblib
    import os
    import numpy as np

    # An empty save_path means "current directory"; makedirs('') would raise.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    report_path = os.path.join(save_path, 'report.txt')
    optimized_path = os.path.join(save_path, 'optimized.txt')

    # Full data set used for the final refit of every saved model.
    X_all = np.r_[np.array(X_train), np.array(X_test)]
    y_all = np.r_[np.array(y_train), np.array(y_test)]

    def evaluate(model, title, handle):
        train_report = classification_report(y_train, model.predict(X_train), target_names=target_names)
        test_report = classification_report(y_test, model.predict(X_test), target_names=target_names)
        _write_report_section(handle, title, train_report, test_report)

    def refit_and_save(model, title):
        model.fit(X_all, y_all)
        joblib.dump(model, os.path.join(save_path, title + '.plk'))

    if not just_emsemble:
        tool = MLTools(X_train, y_train, X_test, y_test)
        # Bound methods, not results: each model is trained inside the loop
        # so the progress message is printed before the work starts.
        trainers = [
            ('多項式樸素貝葉斯', tool.multinomial_naive_bayes_classifier),
            ('高斯樸素貝葉斯', tool.gaussian_naive_bayes_classifier),
            ('K最近鄰', tool.knn_classifier),
            ('邏輯回歸', tool.logistic_regression_classifier),
            ('支持向量機', tool.svm_classifier),
            ('決策樹', tool.decision_tree_classifier),
            ('隨機森林', tool.random_forest_classifier),
            ('Adaboost', tool.adaboost_classifier),
            ('GBDT', tool.gradient_boosting_classifier),
            ('xgboost', tool.xgboost_classifier),
        ]

        # Explicit UTF-8: the reports contain Chinese text.
        with open(report_path, 'w', encoding='utf-8') as f, \
                open(optimized_path, 'w', encoding='utf-8') as g:
            for count, (model_name, train_model) in enumerate(trainers, start=1):
                print(str(count) + '. 正在運行:', model_name, '...')
                model, optimized = train_model()
                evaluate(model, model_name, f)

                g.write('- ' + model_name + '\n')
                if optimized:
                    g.write(optimized)
                g.write('\n')

                refit_and_save(model, model_name)

    # Build the ensemble from the saved tree-based models.
    from sklearn.ensemble import VotingClassifier

    emsemble_names = ['隨機森林', 'Adaboost', 'GBDT', 'xgboost']
    # NOTE: joblib.load unpickles arbitrary objects; only point save_path at
    # a directory whose contents you trust.
    estimators = [(name, joblib.load(os.path.join(save_path, name + '.plk')))
                  for name in emsemble_names]
    voting_clf = VotingClassifier(estimators, voting='soft', n_jobs=-1)

    print('11. 正在運行:集成模型...')
    voting_clf.fit(X_train, y_train)
    with open(report_path, 'a', encoding='utf-8') as f:
        evaluate(voting_clf, '集成模型', f)
    refit_and_save(voting_clf, '集成模型')


def example1():
    """Run the toolkit on the iris data set."""
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    iris_data = iris['data']
    iris_target = iris['target']
    iris_names = iris['target_names']
    X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_target, test_size=0.2, random_state=42)
    model_building(X_train, y_train, X_test, y_test, save_path='./models/', target_names=iris_names)


def example2():
    """Run the toolkit on two classes (4 and 9) of the handwritten digits data set."""
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split
    import numpy as np

    digits = load_digits()
    digits_data = digits['images']
    digits_target = digits['target']

    # Flatten each image into one feature vector.
    shape = digits_data.shape
    X = np.array(digits_data).reshape(shape[0], shape[1] * shape[2])

    a, b = 4, 9
    index1 = digits_target == a
    index2 = digits_target == b
    X = np.r_[X[index1], X[index2]]
    y = np.r_[digits_target[index1], digits_target[index2]]
    names = [str(a), str(b)]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model_building(X_train, y_train, X_test, y_test, save_path='./models2/', target_names=names)


if __name__ == '__main__':
    example1()
我的CSDN:https://blog.csdn.net/qq_21579045
我的博客園:https://www.cnblogs.com/lyjun/
我的Github:https://github.com/TinyHandsome
紙上得來終覺淺,絕知此事要躬行~
歡迎大家過來OB~
by 李英俊小朋友
總結
以上是生活随笔為你收集整理的【新手福音】分享一个自己制作的机器学习工具集合的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Samba安装,你可能没有权限使用网络资
- 下一篇: 2019-新年新计划