我們以2022年全國服務外包大賽的A03題目作為示例代碼演示缺失值填補過程。
問題的主要任務是找出商品的銷量異常和價格異常,提供4個月的商品信息數據,共1700萬余條,4個月的店鋪信息數據,共60萬余條,強調時間復雜度、空間復雜度、異常值識別率和準確率。我們用店鋪分析輔助商品的異常識別,以提高可信度和準確率。但是店鋪主要業務中存在較多缺失,對之后衍生變量計算有較大影響。
??店鋪部分數據鏈接:https://pan.baidu.com/s/1iAp-s2JwG_YTB35BevMNyQ 提取碼:jhnb
個人認為,缺失值填補本質上是一個預測問題。因此,在隨機森林算法效果不佳的情況下,我們采取一個AutoML工具hyperGBM(中文使用說明:https://hypergbm.readthedocs.io/zh_CN/latest/example.html)在10個模型中選取效果最好的模型預測缺失值。事實上這是一個分類問題,我們借助店鋪的業務范圍預測店鋪的主要業務。因為店鋪的業務范圍長短不同,所以我們根據業務范圍的長度將數據集分為13個部分,分別對這13個部分進行模型訓練填補缺失值。在整體流程中這屬于第一步缺失值填補:
import numpy
as np
import pandas
as pdpd
.set_option
('display.max_columns', None)
from sklearn
.model_selection
import train_test_split
from sklearn
.preprocessing
import MinMaxScaler
, StandardScaler
from sklearn
.metrics
import get_scorer
, classification_report
from hypergbm
import make_experiment
import os
import pickle
def normalize(file, group
):min_max
= MinMaxScaler
(feature_range
=(0, 1))ret
= fileret
[file.columns
[group
]] = min_max
.fit_transform
(file[file.columns
[group
]])return ret
def make_number(df_file
):dic_main_bussiness_labels
= df_file
.value_counts
("MAIN_BUSINESS").to_dict
() no
= 0 for key
, value
in dic_main_bussiness_labels
.items
():dic_main_bussiness_labels
[key
] = nono
+= 1df_file
["MAIN_BUSINESS"] = [dic_main_bussiness_labels
[name
] for name
in df_file
["MAIN_BUSINESS"]] list_main_scope
= list() set_main_scope
= set() for name
in df_file
["BUSINESS_SCOPE"]:try:new
= name
.split
(",") except:new
= str(name
).split
(",") while "" in new
: new
.remove
("") list_main_scope
.append
(new
)set_main_scope
.update
(new
) set_main_scope_num
= {i
for i
in range(len(set_main_scope
))} dic_mainscope_labels
= dict(zip(set_main_scope
, set_main_scope_num
)) list_num_main_scope
= list() count
= list() for item
in list_main_scope
:temp
= []for bussiness
in item
:temp
.append
(dic_mainscope_labels
[bussiness
])list_num_main_scope
.append
(temp
)count
.append
(len(temp
))df_file
["BUSINESS_SCOPE"] = list_num_main_scopedf_file
["count"]=count
return df_file
.drop
(axis
=1, columns
="Unnamed: 0"), dic_main_bussiness_labels
, dic_mainscope_labels
def make_number_Nan(df_Nan
, dict_Scope
): list_main_scope
= []index_insetead
= [i
for i
in range(df_Nan
.shape
[0])]df_Nan
.index
= index_insetead
for name
in df_Nan
["BUSINESS_SCOPE"]:try:new
= name
.split
(",")except:new
= str(name
).split
(",") list_main_scope
.append
(new
)list_num_main_scope
= list()count
= list()hema_index
= list() for i
, item
in enumerate(list_main_scope
):temp
= []for bussiness
in item
:try:temp
.append
(dict_Scope
[bussiness
])except: if(bussiness
!=""):temp
.append
(16) hema_index
.append
(i
)list_num_main_scope
.append
(temp
)count
.append
(len(temp
))df_Nan
["BUSINESS_SCOPE"] = list_num_main_scopedf_Nan
["count"] = count
for i
in hema_index
: df_Nan
.at
[i
, "MAIN_BUSINESS"] = 33df_hema
= df_Nan
[df_Nan
["MAIN_BUSINESS"] == 33]df_Nan
= df_Nan
.drop
(df_Nan
.index
[hema_index
]) return df_Nan
.drop
(axis
=1, columns
="Unnamed: 0"), df_hema
.drop
(axis
=1, columns
="Unnamed: 0")def fill_null(df_no_Nan
, df_has_Nan
, dict_BussinessScope
):print("預測數據長度:\n", df_has_Nan
["count"].value_counts
()) df_predict_result
= df_no_Nan
for i
in range(1, 14): col_names
= ["col"+str(col_name
) for col_name
in range(i
)] train_data
= df_no_Nan
[df_no_Nan
["count"] == i
][["BUSINESS_SCOPE", "MAIN_BUSINESS"]] train_data
= train_data
.groupby
("MAIN_BUSINESS").filter(lambda x
: (len(x
) >= 5))print(train_data
.value_counts
("MAIN_BUSINESS"))predict_data
= df_has_Nan
[df_has_Nan
["count"] == i
]df_train_devided
= train_data
['BUSINESS_SCOPE'].apply(pd
.Series
, index
=col_names
) df_predict_devided
= predict_data
['BUSINESS_SCOPE'].apply(pd
.Series
, index
=col_names
)train_data
= pd
.concat
([df_train_devided
, train_data
.drop
(columns
="BUSINESS_SCOPE", axis
=1)], axis
=1) predict_data_test
= pd
.concat
([df_predict_devided
, predict_data
[["BUSINESS_SCOPE", "MAIN_BUSINESS"]].drop
(columns
="BUSINESS_SCOPE", axis
=1)], axis
=1) try:x_train
, y_train
= train_test_split
(train_data
, random_state
=1129, test_size
=0.2, stratify
=train_data
["MAIN_BUSINESS"])except:x_train
, y_train
= train_test_split
(train_data
, random_state
=1129, test_size
=0.2)try:y_train
, z_train
= train_test_split
(y_train
, random_state
=1129, test_size
=0.5, stratify
=y_train
["MAIN_BUSINESS"])except:y_train
, z_train
= train_test_split
(y_train
, random_state
=1129, test_size
=0.5)x_train
.to_csv
("train.csv", encoding
="utf-8-sig")y_train
.to_csv
("eval.csv", encoding
="utf-8-sig")predict_data_test
.to_csv
("test.csv", encoding
="utf-8-sig")exp
= make_experiment
("train.csv", test_data
=None, eval_data
="eval.csv", target
='MAIN_BUSINESS', reward_metric
='accuracy', log_level
='info', class_balancing
='ClassWeight', cv
=True) estiamtor
= exp
.run
() with open('model'+str(i
)+' .pkl', 'wb') as f
:pickle
.dump
(estiamtor
, f
)print("完事!!\n\n")z_pred
= estiamtor
.predict
(z_train
.drop
(axis
=1, columns
="MAIN_BUSINESS"))print(classification_report
(z_train
["MAIN_BUSINESS"].tolist
(), pd
.Series
(z_pred
, index
=z_train
.index
), digits
=5))pred_proba
= estiamtor
.predict_proba
(predict_data_test
)result
= np
.argmax
(pred_proba
, axis
=1)predict_data
["MAIN_BUSINESS"] = result predict_data
.to_csv
("填補后店鋪數據"+str(i
)+".csv", encoding
="utf-8-sig")df_predict_result
= pd
.concat
([df_predict_result
, predict_data
], axis
=0) os
.remove
("test.csv")os
.remove
("train.csv")os
.remove
("eval.csv")df_predict_result
.to_csv
("填補后店鋪數據.csv", encoding
="utf-8-sig")return df_predict_result
def main():df_file
= pd
.read_csv
("../new feature/店鋪數據.csv", encoding
="utf-8-sig")df_NoNan
= df_file
.dropna
(axis
=0, how
='any', subset
=["MAIN_BUSINESS"])df_has_Nan
= df_file
[df_file
[["MAIN_BUSINESS"]].isnull
().T
.any()]df_has_Nan
.dropna
(axis
=0, how
='any', subset
=["BUSINESS_SCOPE"])[["MAIN_BUSINESS", "BUSINESS_SCOPE"]]df_NoNan_numbered
, dict_MainBussiness
, dict_BussinessScope
= make_number
(df_NoNan
)df_has_Nan_numbered
, df_hema
= make_number_Nan
(df_has_Nan
, dict_BussinessScope
)df_NoNan_numbered
= pd
.concat
([df_NoNan_numbered
, df_hema
]) dict_MainBussiness
.update
(zip({"盒馬"}, {33})) dict_BussinessScope
.update
(zip({"盒馬"}, {16}))print("主要業務編號詞典:\n", dict_MainBussiness
)print("業務范圍詞典:\n", dict_BussinessScope
)dict_numbers_to_MainBussiness
= dict([value
, key
] for key
, value
in dict_MainBussiness
.items
()) dict_numbers_to_BussinessScope
= dict([value
, key
] for key
, value
in dict_BussinessScope
.items
())print("向量化完畢!!!")df_fill_null
= fill_null
(df_NoNan_numbered
, df_has_Nan_numbered
, dict_BussinessScope
)print("缺失值填補完畢!!!")
后面幾組數據由于維度較高且數據量較少,預測的準確率較低;因為這部分數據較少,我們采取人工手動標注的方法填補缺失值。
??模型調參結果圖:
相關文章
??數據概覽與預處理https://blog.csdn.net/Hjh1906008151/article/details/124313507
??衍生變量計算(缺失值填補就是為了計算衍生變量)https://blog.csdn.net/Hjh1906008151/article/details/124330708
??異常值識別基礎方法https://blog.csdn.net/Hjh1906008151/article/details/124342492
??基于pyod的異常值識別方法https://editor.csdn.net/md/?articleId=124340047
??異常值識別效果不佳的解決思路https://blog.csdn.net/Hjh1906008151/article/details/124341064
總結
以上是生活随笔為你收集整理的基于自动机器学习工具hyperGBM的异常值识别中缺失值填补问题(含2022年全国服务外包大赛实例)的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。