LightGBM用法速查表

Nona ·
更新时间:2024-09-21
· 860 次阅读

内置方式建模

1. 先把数据读取成 lgb.Dataset 格式
2. 再调用 lgb.train 进行训练

# coding: utf-8
# ===========================================================================
# Example 1 - regression with the native LightGBM API
# (original section heading: "内置方式建模")
# ===========================================================================
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# Load the datasets (tab-separated, no header row, label in column 0).
print('加载数据...')
df_train = pd.read_csv('./data/regression.train.txt', header=None, sep='\t')
df_test = pd.read_csv('./data/regression.test.txt', header=None, sep='\t')

# Split label / features for the training and test sets.
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values

# Wrap the arrays in LightGBM's Dataset format. The validation set references
# the training set so both are constructed consistently.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Fix one set of hyper-parameters.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # Regression metrics only: the original listed 'auc', which is a
    # ranking/classification metric and is not meaningful for this objective.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

print('开始训练...')
# Train. Early stopping is requested through a callback; the
# `early_stopping_rounds=` keyword of lgb.train() was removed in LightGBM 4.0.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

# Save the model to a file.
print('保存模型...')
gbm.save_model('model.txt')

print('开始预测...')
# Predict with the best iteration found by early stopping.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Evaluate.
print('预估结果的rmse为:')
print(mean_squared_error(y_test, y_pred) ** 0.5)


# ===========================================================================
# Example 2 - training with per-sample weights
# (original section heading: "添加样本权重训练")
# ===========================================================================
import json
import os
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings("ignore")

# Load the datasets and the matching weight files.
print('加载数据...')
df_train = pd.read_csv('./data/binary.train', header=None, sep='\t')
df_test = pd.read_csv('./data/binary.test', header=None, sep='\t')
W_train = pd.read_csv('./data/binary.train.weight', header=None)[0]
W_test = pd.read_csv('./data/binary.test.weight', header=None)[0]

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values

num_train, num_feature = X_train.shape

# Generate the feature names.
feature_name = [f'feature_{col}' for col in range(num_feature)]

# Attach the weights while building the Datasets. Feature names and the
# categorical column are declared here as well: lgb.train() no longer accepts
# `feature_name=` / `categorical_feature=` since LightGBM 4.0.
# free_raw_data=False keeps the raw data so the same Dataset can be reused
# by the continued-training calls further down.
lgb_train = lgb.Dataset(
    X_train,
    y_train,
    weight=W_train,
    feature_name=feature_name,
    categorical_feature=[21],
    free_raw_data=False,
)
lgb_eval = lgb.Dataset(
    X_test,
    y_test,
    reference=lgb_train,
    weight=W_test,
    free_raw_data=False,
)

# Set the parameters.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

print('开始训练...')
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    valid_sets=[lgb_train],  # evaluate on the training set
)


# ===========================================================================
# Saving, loading and predicting with a model
# (original section heading: "模型的载入与预测")
# ===========================================================================
# Inspect a feature name.
print('完成10轮训练...')
print('第7个特征为:')
print(repr(lgb_train.feature_name[6]))

# Persist the model; create the target directory first so save_model()
# does not fail on a fresh checkout.
os.makedirs('./model', exist_ok=True)
gbm.save_model('./model/lgb_model.txt')

# Feature names.
print('特征名称:')
print(gbm.feature_name())

# Feature importances.
print('特征重要度:')
print(list(gbm.feature_importance()))

# Load the model back from disk.
print('加载模型用于预测')
bst = lgb.Booster(model_file='./model/lgb_model.txt')

# Predict.
y_pred = bst.predict(X_test)

# Evaluate on the test set.
print('在测试集上的rmse为:')
print(mean_squared_error(y_test, y_pred) ** 0.5)


# ===========================================================================
# Continue training from an existing model
# (original section heading: "接着之前的模型继续训练")
# ===========================================================================
# Initialise from the model saved at ./model/lgb_model.txt.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model='./model/lgb_model.txt',
    valid_sets=[lgb_eval],
)

print('以旧模型为初始化,完成第 10-20 轮训练...')

# Hyper-parameters can be changed while training; here the learning rate
# decays every round. This goes through the reset_parameter callback because
# the `learning_rates=` keyword of lgb.train() was removed in LightGBM 4.0.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.reset_parameter(
            learning_rate=lambda current_round: 0.05 * (0.99 ** current_round)
        )
    ],
)

print('逐步调整学习率完成第 20-30 轮训练...')

# Other hyper-parameters can be scheduled the same way: one value per round.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=[lgb_eval],
    callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
)

print('逐步调整bagging比率完成第 30-40 轮训练...')


# ===========================================================================
# Custom objective and evaluation functions (similar to the xgboost form)
# (original section heading: "自定义损失函数")
# ===========================================================================
def loglikelihood(preds, train_data):
    """Custom objective: binary log-likelihood loss.

    Args:
        preds: raw scores (margins) produced by the booster.
        train_data: the lgb.Dataset being trained on; labels are read from it.

    Returns:
        A ``(grad, hess)`` tuple: first and second derivatives of the loss
        with respect to the raw scores.
    """
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # margin -> probability
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess


# Backward-compatible alias for the original (misspelled) name.
loglikelood = loglikelihood


def binary_error(preds, train_data):
    """Custom metric: fraction of misclassified samples.

    With a custom objective the booster hands over raw margins, not
    probabilities, so they are passed through the sigmoid before applying
    the 0.5 threshold.

    Returns:
        ``(name, value, is_higher_better)`` as expected by ``feval``.
    """
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # margin -> probability
    return 'error', np.mean(labels != (preds > 0.5)), False


# LightGBM >= 4.0 takes a custom objective as a callable in
# params['objective']; the former `fobj=` keyword of lgb.train() was removed.
# NOTE: built-in metrics in `params` are still computed on raw margins when a
# custom objective is used, so rely on the custom `error` metric here.
params_custom_obj = dict(params)
params_custom_obj['objective'] = loglikelihood

gbm = lgb.train(
    params_custom_obj,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    feval=binary_error,
    valid_sets=[lgb_eval],
)

print('用自定义的损失函数与评估标准完成第40-50轮...')
作者:小菜鸡一号



速查表 lightgbm

需要 登录 后方可回复, 如果你还没有账号请 注册新账号