二手车交易价格预测学习笔记 -- Task5

Odessa ·
更新时间:2024-09-21
· 990 次阅读

赛题:零基础入门数据挖掘 - 二手车交易价格预测
地址:https://tianchi.aliyun.com/competition/entrance/231784/introduction?spm=5176.12281957.1004.1.38b02448ausjSX

模型融合 常用方法 from sklearn.datasets import make_blobs #聚类数据生成 from sklearn import datasets #提供数据集 from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import VotingClassifier from xgboost import XGBClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split #随机划分训练集、测试集 from sklearn.datasets import make_moons #生成月亮形数据集 from sklearn.metrics import accuracy_score,roc_auc_score #准确率评估方法 from sklearn.model_selection import cross_val_score #可以K折交叉验证 from sklearn.model_selection import StratifiedKFold #分层K折交叉验证 多模型投票 VotingClassifier() 生成用0填充的数组 np.zeros() sklearn 预测概率矩阵 predict_proba() 模型融合 blending 多折模型融合 stacking 其他集成学习方法 bagging boosting import itertools #迭代器 import matplotlib.gridspec as gridspec #非对称子图 from sklearn.neighbors import KNeighborsClassifier #k近邻分类器 from sklearn.naive_bayes import GaussianNB #朴素贝叶斯 from mlxtend.classifier import StackingClassifier from mlxtend.plotting import plot_learning_curves #学习曲线 from mlxtend.plotting import plot_decision_regions #决策边界 嵌套循环 itertools.product() 也可以生成特征作为新特征 from sklearn import preprocessing #数据预处理 # 降维方法: # 主成分分析 PCA 独立成分分析 FastICA 因子分析 FactorAnalysis 稀疏主成分分析 SparsePCA from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA import lightgbm as lgb import xgboost as xgb # 均方误差 MSE 平均绝对误差 MAE from sklearn.metrics import mean_squared_error, mean_absolute_error 根据数据类型选择特征列 select_dtypes() 小结 Stacking的第二层模型不宜过于复杂; Blending容易过拟合(第二层只用了TrainSet10%的量),但避免了信息泄露; Stacking案例,用鸢尾花数据集,KNN、RandomForest、NaiveBayes作第一层,LogisticRegression作第二层,效果提升明显; 融合

XGBoost的五折交叉回归验证试验

# 5-fold cross-validation of an XGBoost regressor, scored with mean absolute
# error (MAE) on both the training part and the held-out part of every fold.
#
# Relies on names defined outside this snippet: `xgb`, `np`, `StratifiedKFold`,
# `mean_absolute_error`, and the data objects `X_data` / `Y_data`, which are
# indexed with `.iloc` below (presumably a pandas DataFrame / Series -- confirm).
xgr = xgb.XGBRegressor(
    n_estimators=120,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.9,
    max_depth=7,
)  # `objective` is not set here; the original note mentions 'reg:squarederror'.

scores_train = []  # per-fold MAE on the training split
scores = []        # per-fold MAE on the validation split

# NOTE(review): StratifiedKFold stratifies on class labels; for a continuous
# regression target a plain KFold may be the intended splitter -- confirm.
sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_ind, val_ind in sk.split(X_data, Y_data):
    # Features are passed to the model as raw arrays; targets keep their type.
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]

    # The same estimator object is refit on every fold.
    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)

    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    scores.append(score)

# Average over all folds. The original averaged `score_train` (the last fold's
# scalar) instead of the `scores_train` list, so the reported training MAE
# reflected only the final fold.
print('Train mae:', np.mean(scores_train))
print('Val mae', np.mean(scores))

Stacking融合

# Stacking: the predictions of three first-level models become the features of
# a second-level model built by `build_model_lr`.
#
# Relies on names defined outside this snippet: `pd`, `mean_absolute_error`,
# `build_model_lr`, the fitted models `model_lgb` / `model_xgb` / `model_gbdt`,
# the data `x_train` / `y_train` / `y_val` / `TestA_data`, and the precomputed
# predictions `val_*` and `subA_*`.


def _stack_level1(lgb_pred, xgb_pred, gbdt_pred):
    """Return a DataFrame with one column per first-level model prediction."""
    frame = pd.DataFrame()
    # Columns are assigned one at a time, in this fixed order.
    for column, pred in (
        ('Method_1', lgb_pred),
        ('Method_2', xgb_pred),
        ('Method_3', gbdt_pred),
    ):
        frame[column] = pred
    return frame


# Level 1: predictions of each base model on the training features.
train_lgb_pred = model_lgb.predict(x_train)
train_xgb_pred = model_xgb.predict(x_train)
train_gbdt_pred = model_gbdt.predict(x_train)

Strak_X_train = _stack_level1(train_lgb_pred, train_xgb_pred, train_gbdt_pred)
Strak_X_val = _stack_level1(val_lgb, val_xgb, val_gbdt)
Strak_X_test = _stack_level1(subA_lgb, subA_xgb, subA_gbdt)

# Level 2: fit the second-level model on the stacked training predictions.
model_lr_Stacking = build_model_lr(Strak_X_train, y_train)

# Training-set score.
train_pre_Stacking = model_lr_Stacking.predict(Strak_X_train)
train_mae_stacking = mean_absolute_error(y_train, train_pre_Stacking)
print('MAE of Stacking-LR:', train_mae_stacking)

# Validation-set score.
val_pre_Stacking = model_lr_Stacking.predict(Strak_X_val)
val_mae_stacking = mean_absolute_error(y_val, val_pre_Stacking)
print('MAE of Stacking-LR:', val_mae_stacking)

# Test-set prediction.
print('Predict Stacking-LR...')
subA_Stacking = model_lr_Stacking.predict(Strak_X_test)

# Clamp overly small predictions up to 10 (in place).
too_small = subA_Stacking < 10
subA_Stacking[too_small] = 10

# Write the submission file: one row per SaleID with its predicted price.
sub = pd.DataFrame()
sub['SaleID'] = TestA_data.SaleID
sub['price'] = subA_Stacking
sub.to_csv('./sub_Stacking.csv', index=False)
作者:weixin_45727892



学习笔记 二手车 学习

需要 登录 后方可回复, 如果你还没有账号请 注册新账号