赛题:零基础入门数据挖掘 - 二手车交易价格预测
地址:https://tianchi.aliyun.com/competition/entrance/231784/introduction?spm=5176.12281957.1004.1.38b02448ausjSX
from sklearn.datasets import make_blobs #聚类数据生成
from sklearn import datasets #提供数据集
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split #随机划分训练集、测试集
from sklearn.datasets import make_moons #生成月亮形数据集
from sklearn.metrics import accuracy_score,roc_auc_score #准确率评估方法
from sklearn.model_selection import cross_val_score #可以K折交叉验证
from sklearn.model_selection import StratifiedKFold #分段K折交叉验证
多模型投票 VotingClassifier()
生成用0填充的数组 np.zeros()
sklearn 预测概率矩阵 predict_proba()
模型融合 blending 多折模型融合 stacking
其他集成学习方法 bagging boosting
import itertools #迭代器
import matplotlib.gridspec as gridspec #非对称子图
from sklearn.neighbors import KNeighborsClassifier #k近邻分类器
from sklearn.naive_bayes import GaussianNB #朴素贝叶斯
from mlxtend.classifier import StackingClassifier
from mlxtend.plotting import plot_learning_curves #学习曲线
from mlxtend.plotting import plot_decision_regions #决策边界
嵌套循环 itertools.product()
也可以生成特征作为新特征
from sklearn import preprocessing #数据预处理
# 降维方法:
# 主成分分析 PCA 独立成分分析 FastICA 因子分析 FactorAnalysis 稀疏主成分分析 SparsePCA
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA
import lightgbm as lgb
import xgboost as xgb
# 均方差 MSE 平均绝对误差 MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
根据数据类型选择特征列 select_dtypes()
小结
Stacking的第二层模型不宜过于复杂;
Blending容易过拟合(第二层只用了TrainSet10%的量),但避免了信息泄露;
Stacking案例,用鸢尾花数据集,KNN、RandomForest、NaiveBayes作第一层,LogisticRegression作第二层,效果提升明显;
融合
XGBoost的五折交叉回归验证试验
## xgb
## xgb: regressor used inside the 5-fold CV loop, MAE as the metric
xgr = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, subsample=0.8,
                       colsample_bytree=0.9, max_depth=7)  # ,objective ='reg:squarederror'
scores_train = []  # per-fold MAE on the training split
scores = []        # per-fold MAE on the validation split
## 5-fold cross-validation
# NOTE(review): StratifiedKFold expects discrete class labels; for a continuous
# regression target (price) it will raise / misbehave — confirm Y_data is
# discretized here, otherwise plain KFold is the right splitter.
sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_ind, val_ind in sk.split(X_data, Y_data):
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]

    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)

    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    scores.append(score)

# BUG FIX: original printed np.mean(score_train) — the scalar from the LAST
# fold only. Average the accumulated per-fold list instead.
print('Train mae:', np.mean(scores_train))
print('Val mae', np.mean(scores))
Stacking融合
## 第一层
## Level 1: base-model predictions become the meta-features of the level-2 learner.
# (dict construction preserves column order, matching the original assignments)
stack_train = pd.DataFrame({'Method_1': model_lgb.predict(x_train),
                            'Method_2': model_xgb.predict(x_train),
                            'Method_3': model_gbdt.predict(x_train)})
stack_val = pd.DataFrame({'Method_1': val_lgb,
                          'Method_2': val_xgb,
                          'Method_3': val_gbdt})
stack_test = pd.DataFrame({'Method_1': subA_lgb,
                           'Method_2': subA_xgb,
                           'Method_3': subA_gbdt})

## level2-method: linear regression on top of the three meta-features
model_lr_Stacking = build_model_lr(stack_train, y_train)

## training-set MAE of the stacked model
print('MAE of Stacking-LR:', mean_absolute_error(y_train, model_lr_Stacking.predict(stack_train)))

## validation-set MAE of the stacked model
print('MAE of Stacking-LR:', mean_absolute_error(y_val, model_lr_Stacking.predict(stack_val)))

## test-set prediction for submission
print('Predict Stacking-LR...')
subA_Stacking = model_lr_Stacking.predict(stack_test)
subA_Stacking[subA_Stacking < 10] = 10  # clip implausibly small predicted prices

sub = pd.DataFrame()
sub['SaleID'] = TestA_data.SaleID
sub['price'] = subA_Stacking
sub.to_csv('./sub_Stacking.csv', index=False)