最近在打kaggle,这里做一下xgboost笔记,这里不讲原理,只讲xgboost库的使用,以及一些参数调节。
传送门: xgboost参数说明
用泰坦尼克号这个经典例子来说:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
train = pd.read_csv("datasets/titanic_train.csv")
test = pd.read_csv("datasets/titanic_test.csv")
#数据清洗
def clean(titanic):
    """Engineer lowercase model-ready feature columns on a Titanic frame.

    Mutates *titanic* in place, adding: age (median-filled), child, sex,
    embarked, family, cabin, name, fare. Returns the same DataFrame so the
    call can be chained.
    """
    titanic["age"] = titanic["Age"].fillna(titanic["Age"].median())
    # BUG FIX: use the median-filled "age" (created just above) rather than
    # the raw NaN-bearing "Age" when flagging children.
    titanic["child"] = titanic["age"].apply(lambda x: 1 if x < 15 else 0)
    titanic["sex"] = titanic["Sex"].apply(lambda x: 1 if x == "male" else 0)
    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    def embark(port):
        # Port of embarkation: S -> 1, C -> 2, anything else (Q) -> 3.
        if port == "S":
            return 1
        elif port == "C":
            return 2
        else:
            return 3

    titanic["embarked"] = titanic["Embarked"].apply(embark)
    # Family size = siblings/spouses + parents/children + the passenger.
    titanic["family"] = titanic["SibSp"] + titanic["Parch"] + 1
    # BUG FIX: missing Cabin entries are NaN, not the string "N"; NaN == "N"
    # is False, so the original marked every missing cabin as 1 (has cabin).
    # Treat both NaN and "N" as "no cabin recorded".
    titanic["cabin"] = titanic["Cabin"].apply(
        lambda x: 0 if pd.isnull(x) or x == "N" else 1
    )

    def getname(name):
        # "Mrs" must be tested before "Mr" because "Mr" is its substring.
        if "Mrs" in str(name):
            return 2
        elif "Mr" in str(name):
            return 1
        else:
            return 0

    titanic["name"] = titanic["Name"].apply(getname)
    titanic["fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    return titanic
# Model selection: grid-search n_estimators / max_depth for an XGBoost
# classifier with 5-fold cross-validated accuracy.
# NOTE: sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

train_data = clean(train)
test_data = clean(test)
features = ["Pclass", "sex", "child", "family", "fare", "embarked", "cabin"]

# verbosity=0 replaces the `silent=True` flag removed in xgboost >= 1.0.
clf = XGBClassifier(learning_rate=0.1, max_depth=2, verbosity=0,
                    objective='binary:logistic')
param_test = {
    'n_estimators': list(range(30, 51, 2)),  # 30, 32, ..., 50
    'max_depth': [2, 3, 4, 5, 6, 7],
}
grid_search = GridSearchCV(estimator=clf, param_grid=param_test,
                           scoring='accuracy', cv=5)
# Use the cleaned frames consistently (clean() returns the same objects).
grid_search.fit(train_data[features], train_data["Survived"])
# grid_scores_ was removed from scikit-learn; cv_results_ holds the
# per-candidate scores. The original line was a bare expression whose
# value was silently discarded in a script — print it instead.
print(grid_search.cv_results_, grid_search.best_params_, grid_search.best_score_)
predict_data = grid_search.predict(test_data[features])
在xgboost中使用交叉验证
为了防止过拟合,需要交叉验证
import xgboost as xgb
from Utils import pathUtils
import pandas as pd
from sklearn.model_selection import train_test_split
from Process.genBasicData import genData
from Utils import feaUtils
# Cross-validated xgboost: run xgb.cv with early stopping to find the best
# number of boosting rounds, then train the final booster on all the data
# and write the test-set predictions to CSV.
train_data = genData(pathUtils.train_path)
test_data = genData(pathUtils.test_path)

param = {
    'max_depth': 3,
    # BUG FIX: the key was 'learning_rate ' (note the trailing space), an
    # unknown parameter that xgboost silently ignores — so training actually
    # ran at the default learning rate (0.3) instead of the intended 0.01.
    'learning_rate': 0.01,
    'verbosity': 0,               # replaces the removed `silent` flag
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': 10,       # up-weight the rare positive class
    'subsample': 0.8,
    'min_child_weight': 1,
}

# Every column except the id and the target is a feature.
features = [c for c in train_data.columns if c not in ("ID", "y")]
dtrain = xgb.DMatrix(train_data[features], label=train_data['y'])
dtest = xgb.DMatrix(test_data[features])

# 10-fold CV with early stopping; the returned frame keeps one row per
# boosting round that survived, so its length is the best round count.
cv_res = xgb.cv(param, dtrain, num_boost_round=2000, early_stopping_rounds=30,
                nfold=10, metrics='auc', show_stdv=True)
print(cv_res)

bst = xgb.train(param, dtrain, num_boost_round=cv_res.shape[0])
y_pre = bst.predict(dtest)
res = pd.concat([test_data[["ID"]], pd.DataFrame(y_pre, columns=["pred"])], axis=1)
res.to_csv(pathUtils.predict_root_path + "cv_res.csv", index=False)