LightGBM Usage Cheat Sheet
1. Load CSV data and train a model with specified parameters
# coding: utf-8
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# Load the datasets
print('Load data...')
df_train = pd.read_csv('regression.train.txt', header=None, sep='\t')
df_test = pd.read_csv('regression.test.txt', header=None, sep='\t')

# Split the training and test sets into labels and features
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values

# Build LightGBM Dataset objects
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Settle on a set of parameters
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Start training...')
# Train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

# Save the model
print('Saving model...')
gbm.save_model('model.txt')

print('Start predicting...')
# Predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Evaluate
print('The RMSE of prediction is:')
print(mean_squared_error(y_test, y_pred) ** 0.5)

#Load data...
#Start training...
#[1] valid_0's l2: 0.24288  valid_0's auc: 0.764496
#Training until validation scores don't improve for 5 rounds.
#[2] valid_0's l2: 0.239307 valid_0's auc: 0.766173
#[3] valid_0's l2: 0.235559 valid_0's auc: 0.785547
#[4] valid_0's l2: 0.230771 valid_0's auc: 0.797786
#[5] valid_0's l2: 0.226297 valid_0's auc: 0.805155
#[6] valid_0's l2: 0.223692 valid_0's auc: 0.800979
#[7] valid_0's l2: 0.220941 valid_0's auc: 0.806566
#[8] valid_0's l2: 0.217982 valid_0's auc: 0.808566
#[9] valid_0's l2: 0.215351 valid_0's auc: 0.809041
#[10] valid_0's l2: 0.213064 valid_0's auc: 0.805953
#[11] valid_0's l2: 0.211053 valid_0's auc: 0.804631
#[12] valid_0's l2: 0.209336 valid_0's auc: 0.802922
#[13] valid_0's l2: 0.207492 valid_0's auc: 0.802011
#[14] valid_0's l2: 0.206016 valid_0's auc: 0.80193
#Early stopping, best iteration is:
#[9] valid_0's l2: 0.215351 valid_0's auc: 0.809041
#Saving model...
#Start predicting...
#The RMSE of prediction is:
#0.4640593794679212
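Note that recent LightGBM releases (4.0 and later) removed the `early_stopping_rounds` keyword from `lgb.train` in favor of callbacks. A minimal sketch of the equivalent call, assuming a LightGBM version where `lgb.early_stopping` and `lgb.log_evaluation` are available (3.3+):

# Equivalent training call using callbacks instead of the
# early_stopping_rounds keyword (required on LightGBM >= 4.0)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5),
                           lgb.log_evaluation(period=1)])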
2. Train with sample weights

# coding: utf-8
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

# Load the datasets
print('Load data...')
df_train = pd.read_csv('binary.train', header=None, sep='\t')
df_test = pd.read_csv('binary.test', header=None, sep='\t')
W_train = pd.read_csv('binary.train.weight', header=None)[0]
W_test = pd.read_csv('binary.test.weight', header=None)[0]

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
num_train, num_feature = X_train.shape

# Attach the sample weights when constructing the Datasets
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)

# Set parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # evaluate on the training set
                feature_name=feature_name,
                categorical_feature=[21])

#Load data...
#Start training...
#[1] valid_0's binary_logloss: 0.681265
#[2] valid_0's binary_logloss: 0.673318
#[3] valid_0's binary_logloss: 0.664193
#[4] valid_0's binary_logloss: 0.655501
#[5] valid_0's binary_logloss: 0.650956
#[6] valid_0's binary_logloss: 0.644803
#[7] valid_0's binary_logloss: 0.637567
#[8] valid_0's binary_logloss: 0.631224
#[9] valid_0's binary_logloss: 0.624958
#[10] valid_0's binary_logloss: 0.619398
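The weight files above are plain one-column text files aligned row-for-row with the training data. If you have no weight file, the weight vector can be built directly from the labels; a minimal sketch, where the 2.0 factor for positive examples is purely illustrative, not a recommended value:

# Hypothetical alternative: derive weights from the labels instead of a file.
# The 2.0 upweighting of positives is an arbitrary illustration.
W_train = np.where(y_train == 1, 2.0, 1.0)
W_test = np.where(y_test == 1, 2.0, 1.0)
lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)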
3. Load the model and predict

# Check feature names
print('Finished first 10 rounds...')
print('The 7th feature is:')
print(repr(lgb_train.feature_name[6]))

# Save the model
gbm.save_model('./model/lgb_model.txt')

# Feature names
print('Feature names:')
print(gbm.feature_name())

# Feature importances
print('Feature importances:')
print(list(gbm.feature_importance()))

# Load the model
print('Loading model to predict')
bst = lgb.Booster(model_file='./model/lgb_model.txt')
# Predict
y_pred = bst.predict(X_test)
# Evaluate on the test set
print('The RMSE on the test set is:')
print(mean_squared_error(y_test, y_pred) ** 0.5)

#Finished first 10 rounds...
#The 7th feature is:
#'feature_6'
#Feature names:
#[u'feature_0', u'feature_1', u'feature_2', u'feature_3', u'feature_4', u'feature_5', u'feature_6', u'feature_7', u'feature_8', u'feature_9', u'feature_10', u'feature_11', u'feature_12', u'feature_13', u'feature_14', u'feature_15', u'feature_16', u'feature_17', u'feature_18', u'feature_19', u'feature_20', u'feature_21', u'feature_22', u'feature_23', u'feature_24', u'feature_25', u'feature_26', u'feature_27']
#Feature importances:
#[8, 5, 1, 19, 7, 33, 2, 0, 2, 10, 5, 2, 0, 9, 3, 3, 0, 2, 2, 5, 1, 0, 36, 3, 33, 45, 29, 35]
#Loading model to predict
#The RMSE on the test set is:
#0.4629245607636925
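Besides the text format written by `save_model`, a booster can also be dumped as JSON, which is what the otherwise unused `json` import in these scripts is for. A minimal sketch; the output path is an arbitrary choice:

# Dump the model structure to a JSON file for inspection or custom tooling
model_json = gbm.dump_model()  # returns the model as a Python dict
with open('./model/lgb_model.json', 'w') as f:
    json.dump(model_json, f, indent=4)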
4. Continue training from a previous model

# Continue training,
# initializing from the model saved at ./model/lgb_model.txt
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='./model/lgb_model.txt',
                valid_sets=lgb_eval)

print('Finished rounds 10-20 with the old model as initialization...')

# Adjust hyperparameters during training;
# here the learning rate is decayed each iteration
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finished rounds 20-30 with a gradually decaying learning rate...')

# Adjust other hyperparameters
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished rounds 30-40 with a changing bagging fraction...')

#[11] valid_0's binary_logloss: 0.616177
#[12] valid_0's binary_logloss: 0.611792
#[13] valid_0's binary_logloss: 0.607043
#[14] valid_0's binary_logloss: 0.602314
#[15] valid_0's binary_logloss: 0.598433
#[16] valid_0's binary_logloss: 0.595238
#[17] valid_0's binary_logloss: 0.592047
#[18] valid_0's binary_logloss: 0.588673
#[19] valid_0's binary_logloss: 0.586084
#[20] valid_0's binary_logloss: 0.584033
#Finished rounds 10-20 with the old model as initialization...
#[21] valid_0's binary_logloss: 0.616177
#[22] valid_0's binary_logloss: 0.611834
#[23] valid_0's binary_logloss: 0.607177
#[24] valid_0's binary_logloss: 0.602577
#[25] valid_0's binary_logloss: 0.59831
#[26] valid_0's binary_logloss: 0.595259
#[27] valid_0's binary_logloss: 0.592201
#[28] valid_0's binary_logloss: 0.589017
#[29] valid_0's binary_logloss: 0.586597
#[30] valid_0's binary_logloss: 0.584454
#Finished rounds 20-30 with a gradually decaying learning rate...
#[31] valid_0's binary_logloss: 0.616053
#[32] valid_0's binary_logloss: 0.612291
#[33] valid_0's binary_logloss: 0.60856
#[34] valid_0's binary_logloss: 0.605387
#[35] valid_0's binary_logloss: 0.601744
#[36] valid_0's binary_logloss: 0.598556
#[37] valid_0's binary_logloss: 0.595585
#[38] valid_0's binary_logloss: 0.593228
#[39] valid_0's binary_logloss: 0.59018
#[40] valid_0's binary_logloss: 0.588391
#Finished rounds 30-40 with a changing bagging fraction...
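On LightGBM 4.0 and later the `learning_rates` keyword has likewise been removed from `lgb.train`; the same decay schedule can be expressed with the `reset_parameter` callback already used above. A minimal sketch of the equivalent call:

# Equivalent learning-rate decay via a callback
# (the learning_rates keyword was removed in LightGBM 4.0)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(
                    learning_rate=lambda iter: 0.05 * (0.99 ** iter))])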
5. Custom loss function

# Similar to the xgboost interface:
# a custom objective must return the gradient and hessian
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess

# A custom eval function returns (eval_name, eval_result, is_higher_better)
def binary_error(preds, train_data):
    labels = train_data.get_label()
    return 'error', np.mean(labels != (preds > 0.5)), False

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)

print('Finished rounds 40-50 with a custom objective and eval metric...')
#[41] valid_0's binary_logloss: 0.614429 valid_0's error: 0.268
#[42] valid_0's binary_logloss: 0.610689 valid_0's error: 0.26
#[43] valid_0's binary_logloss: 0.606267 valid_0's error: 0.264
#[44] valid_0's binary_logloss: 0.601949 valid_0's error: 0.258
#[45] valid_0's binary_logloss: 0.597271 valid_0's error: 0.266
#[46] valid_0's binary_logloss: 0.593971 valid_0's error: 0.276
#[47] valid_0's binary_logloss: 0.591427 valid_0's error: 0.278
#[48] valid_0's binary_logloss: 0.588301 valid_0's error: 0.284
#[49] valid_0's binary_logloss: 0.586562 valid_0's error: 0.288
#[50] valid_0's binary_logloss: 0.584056 valid_0's error: 0.288
#Finished rounds 40-50 with a custom objective and eval metric...
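One caveat with custom objectives: LightGBM then treats the model output as a raw score, so `predict` returns unscaled margins rather than probabilities. A minimal sketch of recovering class probabilities under the loglikelihood objective above:

# With a custom objective, predict() returns raw scores, not probabilities;
# apply the sigmoid manually before thresholding.
raw_scores = gbm.predict(X_test)
probabilities = 1. / (1. + np.exp(-raw_scores))
predicted_labels = (probabilities > 0.5).astype(int)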
Using LightGBM with sklearn

1. Build the model with LightGBM, evaluate with sklearn
# coding: utf-8
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Load the datasets
print('Load data...')
df_train = pd.read_csv('regression.train.txt', header=None, sep='\t')
df_test = pd.read_csv('regression.test.txt', header=None, sep='\t')

# Split into features and labels
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values

print('Start training...')
# Instantiate LGBMRegressor directly;
# it behaves much like any other sklearn regressor
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)

# Fit with the familiar sklearn API
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

# Predict
print('Start predicting...')
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Evaluate
print('The RMSE of prediction is:')
print(mean_squared_error(y_test, y_pred) ** 0.5)

#Load data...
#Start training...
#[1] valid_0's l1: 0.491735
#Training until validation scores don't improve for 5 rounds.
#[2] valid_0's l1: 0.486563
#[3] valid_0's l1: 0.481489
#[4] valid_0's l1: 0.476848
#[5] valid_0's l1: 0.47305
#[6] valid_0's l1: 0.469049
#[7] valid_0's l1: 0.465556
#[8] valid_0's l1: 0.462208
#[9] valid_0's l1: 0.458676
#[10] valid_0's l1: 0.454998
#[11] valid_0's l1: 0.452047
#[12] valid_0's l1: 0.449158
#[13] valid_0's l1: 0.44608
#[14] valid_0's l1: 0.443554
#[15] valid_0's l1: 0.440643
#[16] valid_0's l1: 0.437687
#[17] valid_0's l1: 0.435454
#[18] valid_0's l1: 0.433288
#[19] valid_0's l1: 0.431297
#[20] valid_0's l1: 0.428946
#Did not meet early stopping. Best iteration is:
#[20] valid_0's l1: 0.428946
#Start predicting...
#The RMSE of prediction is:
#0.4441153344254208
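After `fit`, the estimator exposes the usual sklearn-style attributes alongside the LightGBM-specific ones; a short sketch of a few that are handy:

# Inspect the fitted estimator
print(gbm.best_iteration_)       # best round found by early stopping
print(gbm.best_score_)           # dict of eval results at the best round
print(gbm.feature_importances_)  # per-feature split counts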
2. Grid search for optimal hyperparameters

# Use scikit-learn's cross-validated grid search
# to pick the best hyperparameters
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid)

gbm.fit(X_train, y_train)

print('Best hyperparameters found by grid search:')
print(gbm.best_params_)
#Best hyperparameters found by grid search:
#{'n_estimators': 40, 'learning_rate': 0.1}
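By default `GridSearchCV` scores a regressor with R², so model selection may not line up with the RMSE evaluation used above; an explicit `scoring` argument fixes that. A minimal sketch reusing the same `estimator` and `param_grid`:

# Select hyperparameters by (negated) mean squared error
# with 5-fold cross-validation instead of the default R^2 score
gbm = GridSearchCV(estimator, param_grid,
                   scoring='neg_mean_squared_error', cv=5)
gbm.fit(X_train, y_train)
print(gbm.best_params_)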
3. Plotting

# coding: utf-8
import lightgbm as lgb
import pandas as pd

try:
    import matplotlib.pyplot as plt
except ImportError:
    raise ImportError('You need to install matplotlib for plotting.')

# Load the datasets
print('Load data...')
df_train = pd.read_csv('./data/regression.train.txt', header=None, sep='\t')
df_test = pd.read_csv('./data/regression.test.txt', header=None, sep='\t')

# Split into features and labels
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values

# Build LightGBM Dataset objects
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Set parameters
params = {
    'num_leaves': 5,
    'metric': ('l1', 'l2'),
    'verbose': 0
}

evals_result = {}  # to record eval results for plotting

print('Start training...')
# Train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting the 84th tree...')
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

#print('Plotting the 84th tree with graphviz...')
#graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
#graph.render(view=True)

#Load data...
#Start training...
#[10] training's l2: 0.217995 training's l1: 0.457448 valid_1's l2: 0.21641 valid_1's l1: 0.456464
#[20] training's l2: 0.205099 training's l1: 0.436869 valid_1's l2: 0.201616 valid_1's l1: 0.434057
#[30] training's l2: 0.197421 training's l1: 0.421302 valid_1's l2: 0.192514 valid_1's l1: 0.417019
#[40] training's l2: 0.192856 training's l1: 0.411107 valid_1's l2: 0.187258 valid_1's l1: 0.406303
#[50] training's l2: 0.189593 training's l1: 0.403695 valid_1's l2: 0.183688 valid_1's l1: 0.398997
#[60] training's l2: 0.187043 training's l1: 0.398704 valid_1's l2: 0.181009 valid_1's l1: 0.393977
#[70] training's l2: 0.184982 training's l1: 0.394876 valid_1's l2: 0.178803 valid_1's l1: 0.389805
#[80] training's l2: 0.1828 training's l1: 0.391147 valid_1's l2: 0.176799 valid_1's l1: 0.386476
#[90] training's l2: 0.180817 training's l1: 0.388101 valid_1's l2: 0.175775 valid_1's l1: 0.384404
#[100] training's l2: 0.179171 training's l1: 0.385174 valid_1's l2: 0.175321 valid_1's l1: 0.382929
#Plotting metrics recorded during training...
[Figure: feature importance plot]
[Figure: plot of the 84th tree]
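As with early stopping, LightGBM 4.0 moved `evals_result` and `verbose_eval` into callbacks; a sketch of the equivalent recording setup for the training call above, assuming LightGBM >= 3.3:

# Equivalent metric recording via callbacks (LightGBM >= 4.0 removed the
# evals_result and verbose_eval keywords from lgb.train)
evals_result = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                callbacks=[lgb.record_evaluation(evals_result),
                           lgb.log_evaluation(period=10)])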