预测分析·民宿价格预测baseline
生活随笔
收集整理的這篇文章主要介紹了
预测分析·民宿价格预测baseline
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
大家好,我是小澤
預測分析·民宿價格預測比賽是和鯨社區與ChallengeHub聯合舉辦的一場新手賽,本文旨在從多個角度構建特征工程,幫助選手快速上手比賽。
比賽鏈接
話不多說,直接開!
導入相關庫
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor, Pool
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Read the training and test sets and stack them into one frame so that every
# feature transformation below is applied to both in the same way.
train = pd.read_csv('./訓練集.csv')
test = pd.read_csv('./測試集.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index is deliberately NOT set, so the row labels are
# exactly the ones train.append(test) used to produce.
df_features = pd.concat([train, test])

# Next step: fill missing values and apply the encodings.
# Fill missing values. Each column is re-assigned explicitly instead of calling
# fillna(inplace=True) on a column selection: that chained form is deprecated
# and does not modify the frame when pandas Copy-on-Write is active.
for count_col in ['洗手間數量', '床的數量', '臥室數量']:
    df_features[count_col] = df_features[count_col].fillna(-1)

for flag_col in ['房主是否有個人資料圖片', '房主身份是否驗證']:
    df_features[flag_col] = df_features[flag_col].fillna('na')

# The reply rate is filled with '-1', stripped of its '%' sign and cast to int.
reply_rate = df_features['房主回復率'].fillna('-1').astype(str)
df_features['房主回復率'] = reply_rate.str.replace('%', '', regex=False).astype(int)

df_features['民宿周邊'] = df_features['民宿周邊'].fillna('na')

# Missing ratings are replaced by the mean rating of the combined frame.
mean_score = df_features['民宿評分'].mean()
df_features['民宿評分'] = df_features['民宿評分'].fillna(mean_score)

df_features['郵編'] = df_features['郵編'].fillna('na')

# Label-encode the columns that were filled with the 'na' placeholder.
for feat in ['房主是否有個人資料圖片', '房主身份是否驗證', '民宿周邊', '郵編']:
    lbl = LabelEncoder()
    df_features[feat] = lbl.fit_transform(df_features[feat])


def freq_enc(df, col):
    """Add a ``{col}_freq`` column with the relative frequency of each value.

    Args:
        df: DataFrame to extend; it is modified in place.
        col: Name of the column whose values are counted.

    Returns:
        The same DataFrame, now containing the ``{col}_freq`` column.
    """
    # Normalised counts, NaN excluded, as a value -> share mapping.
    vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
    df[f'{col}_freq'] = df[col].map(vc)
    return df


# Frequency-encode the listed columns.
for feat in ['容納人數', '洗手間數量', '床的數量', '床的類型',
             '臥室數量', '取消條款', '所在城市', '清潔費',
             '房主是否有個人資料圖片', '房主回復率', '是否支持隨即預訂',
             '民宿周邊', '房產類型', '房型', '郵編']:
    df_features = freq_enc(df_features, feat)

# Next step: process the time features.
# Time feature processing
from tqdm import tqdm

# Replace each date column by its parsed datetime value cast to int64 and
# floor-divided by 10 ** 9.
# NOTE(review): presumably this turns nanosecond ticks into epoch seconds, and
# missing dates likely end up as a large negative sentinel — confirm.
for date_col in ('首次評論日期', '何時成為房主', '最近評論日期'):
    raw_ticks = pd.to_datetime(df_features[date_col]).values.astype(np.int64)
    df_features[date_col] = raw_ticks // 10 ** 9

# Pairwise differences between the three converted date columns.
timestamp_diffs = {
    'timestamp_diff1': ('首次評論日期', '何時成為房主'),
    'timestamp_diff2': ('最近評論日期', '首次評論日期'),
    'timestamp_diff3': ('最近評論日期', '何時成為房主'),
}
for diff_name, (minuend, subtrahend) in timestamp_diffs.items():
    df_features[diff_name] = df_features[minuend] - df_features[subtrahend]


def brute_force(df, features, groups):
    """Add per-group max/min/mean/median/std columns for every feature.

    For each aggregation, each entry of ``features`` and each entry of
    ``groups`` a column named ``{group}_{feature}_{method}`` is written,
    holding the group-wise statistic broadcast back to every row.

    Args:
        df: DataFrame to extend; it is modified in place.
        features: Names of the columns to aggregate.
        groups: Names of the columns to group by.

    Returns:
        The same DataFrame with the new statistic columns.
    """
    aggregations = ['max', 'min', 'mean', 'median', 'std']
    for agg in tqdm(aggregations):
        for value_col in features:
            for key_col in groups:
                target = f'{key_col}_{value_col}_{agg}'
                df[target] = df.groupby(key_col)[value_col].transform(agg)
    return df


dense_feats = ['timestamp_diff1', 'timestamp_diff2', 'timestamp_diff3']
cate_feats = ['房型']

df_features = brute_force(df_features, dense_feats, cate_feats)

# Next step: other simple business features.
def f(x):
    """Return 1 when ``x`` is strictly greater than 0, otherwise 0."""
    return 1 if x > 0 else 0


# Presence flags for beds, bedrooms and bathrooms. They are computed from the
# combined frame (not from `train`) so that test rows get flags derived from
# their own values rather than from whatever training row shares their label.
df_features['if_bed'] = df_features['床的數量'].apply(f)
df_features['if_bedroom'] = df_features['臥室數量'].apply(f)
df_features['if_wc'] = df_features['洗手間數量'].apply(f)

# Derived cross features. The 1e-3 added to each denominator guards against
# division by zero.
df_features['人均床數量'] = df_features['容納人數'] / (df_features['床的數量'] + 1e-3)
df_features['人均臥室量'] = df_features['容納人數'] / (df_features['臥室數量'] + 1e-3)
df_features['臥室床均量'] = df_features['床的數量'] / (df_features['臥室數量'] + 1e-3)
# Square root of the sum of the squared '維度' and '經度' columns.
df_features['經緯度平方根'] = (df_features['維度'] ** 2 + df_features['經度'] ** 2) ** 0.5


def get_features(df):
    """Add row-wise std/max/min/difference columns for two ratio-feature pairs.

    For each pair ``(a, b)`` the columns ``{a}_{b}_std``, ``{a}_{b}_max``,
    ``{a}_{b}_min`` and ``{a}_{b}_sub`` (``a - b``) are written.

    Args:
        df: DataFrame that already contains the ratio columns; it is modified
            in place.

    Returns:
        The same DataFrame with the new columns.
    """
    pairs = [['人均床數量', '人均臥室量'], ['臥室床均量', '人均臥室量']]
    for left, right in pairs:
        both = df[[left, right]]
        df[f'{left}_{right}_std'] = both.std(axis=1)
        df[f'{left}_{right}_max'] = both.max(axis=1)
        df[f'{left}_{right}_min'] = both.min(axis=1)
        df[f'{left}_{right}_sub'] = df[left] - df[right]
    return df


df_features = get_features(df_features)

# Next step: mine the amenities column ('便利設施').
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


def _count_amenities(raw):
    """Return the number of comma-separated entries in a ``{...}`` amenity string."""
    inner = raw.lstrip('{').rstrip('}')
    # ''.split(',') is [''], which would report one amenity for an empty set.
    return len(inner.split(',')) if inner.strip() else 0


# Single pass that drops braces, quotes and colons and turns commas into
# spaces, so each amenity string becomes whitespace-separated text.
_AMENITY_CLEANUP = str.maketrans({'{': '', '}': '', '"': '', ':': '', ',': ' '})

df_features['便利設施數量'] = df_features['便利設施'].apply(_count_amenities)
df_features['便利設施'] = df_features['便利設施'].apply(lambda x: x.translate(_AMENITY_CLEANUP))

n_components = 12
X = list(df_features['便利設施'].values)

# TF-IDF over single tokens, capped at 10000 vocabulary entries.
tfv = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
X_tfidf = tfv.fit_transform(X)

# Reduce the sparse TF-IDF matrix to n_components dense columns. The seed is
# fixed so that the generated features are reproducible between runs.
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(X_tfidf)
X_svd = svd.transform(X_tfidf)

for i in range(n_components):
    df_features[f'便利設施_tfidf_{i}'] = X_svd[:, i]

# Next step: build the feature matrix and the label vector.
# Rows with a price form the training set; rows without one form the test set.
has_price = df_features['價格'].notnull()
df_train = df_features[has_price].reset_index(drop=True)
df_test = df_features[~has_price]

# Columns that must not be fed to the model; everything else is an input feature.
no_features = ['數據ID', '價格', '便利設施']
features = list(filter(lambda name: name not in no_features, df_train.columns))

X = df_train[features]  # training inputs
y = df_train['價格']  # training labels
X_test = df_test[features]  # test inputs

# Next step: five-fold CatBoost model.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)

# Columns handed to CatBoost as categorical features. The list is the same for
# every fold, so it is built once outside the loop.
cate_features = ['房主是否有個人資料圖片', '房主身份是否驗證', '是否支持隨即預訂',
                 '房產類型', '房型', 'if_bed', 'if_bedroom', 'if_wc']

oof = np.zeros(len(X))  # out-of-fold predictions for the training rows
prediction = np.zeros(len(X_test))  # running sum of per-fold test predictions

for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    # KFold yields positional indices, so select positionally with .iloc for
    # both the features and the labels.
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_pool = Pool(X_train, y_train, cat_features=cate_features)
    eval_pool = Pool(X_valid, y_valid, cat_features=cate_features)

    cbt_model = CatBoostRegressor(
        # Author's note: the score quoted for this baseline was obtained with
        # iterations=60000, which takes rather long to run.
        iterations=10000,
        # Author's note: several properties converge very slowly at lr=0.1;
        # consider raising it later.
        learning_rate=0.1,
        # NOTE(review): the final score below is RMSE while early stopping
        # tracks SMAPE — confirm this is intended.
        eval_metric='SMAPE',
        use_best_model=True,
        random_seed=42,
        logging_level='Verbose',
        # task_type='GPU',
        devices='0',
        gpu_ram_part=0.5,
        early_stopping_rounds=400,
    )
    cbt_model.fit(train_pool, eval_set=eval_pool, verbose=1000)

    y_pred_valid = cbt_model.predict(X_valid)
    y_pred = cbt_model.predict(X_test)

    oof[valid_index] = y_pred_valid.reshape(-1, )
    prediction += y_pred

# Average the test predictions over the folds.
prediction /= n_fold

from sklearn.metrics import mean_squared_error

# RMSE of the out-of-fold predictions. The square root is taken explicitly
# because the `squared=False` argument is deprecated and absent from recent
# scikit-learn releases.
score = np.sqrt(mean_squared_error(df_train['價格'].values, oof))
print(score)

# Write the submission file.
test['價格'] = prediction
test[['數據ID', '價格']].to_csv('./sub_cat.csv', index=False)

# The author reports that this pipeline reaches an online RMSE below 5.3,
# which ranked around the top 10 at the time of writing.
本文主要參考了官方的baseline以及恒哥的代碼思路
如果本文可以幫助到大家,歡迎點個關注!
總結
以上是生活随笔為你收集整理的预测分析·民宿价格预测baseline的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 手把手学会 VS Code 快捷任务神技
- 下一篇: Java Selenium3 WebDr