Learning Objectives
Master the modeling and hyperparameter-tuning workflow for machine learning models.
Content Overview
Linear regression model:
Feature requirements of linear regression;
Handling long-tailed distributions;
Understanding the linear regression model;
Model performance validation:
Evaluation functions and objective functions;
Cross-validation;
Leave-one-out validation;
Validation for time-series problems;
Plotting learning curves;
Plotting validation curves;
Embedded feature selection:
Lasso regression;
Ridge regression;
Decision trees;
Model comparison:
Common linear models;
Common nonlinear models;
Model tuning:
Greedy tuning;
Grid-search tuning;
Bayesian tuning;
Code Examples
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Define reduce_mem_usage: shrink the DataFrame's memory footprint by
# downcasting each column to the smallest dtype that can hold its values
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} bytes'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} bytes'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
sample_feature = reduce_mem_usage(pd.read_csv('data_for_tree.csv'))
#sample_feature.head()
Memory usage of dataframe is 62099624.00 bytes
Memory usage after optimization is: 16520255.00 bytes
Decreased by 73.4%
continuous_feature_names = [x for x in sample_feature.columns if x not in ['price', 'brand', 'model']]
print(continuous_feature_names)
sample_feature = sample_feature.dropna().replace('-', 0).reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)
train = sample_feature[continuous_feature_names + ['price']]
train_x = train[continuous_feature_names]
train_y = train['price']
['SaleID', 'name', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'seller', 'offerType', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'train', 'used_time', 'city', 'brand_amount', 'brand_price_average', 'brand_price_max', 'brand_price_median', 'brand_price_min', 'brand_price_std', 'brand_price_sum', 'power_bin']
#train_x.head()
train_y.head()
0 1850.0
1 6222.0
2 5200.0
3 8000.0
4 3500.0
Name: price, dtype: float64
#1. Simple modeling: fit a linear regression model and inspect the intercept and weights
from sklearn.linear_model import LinearRegression
model = LinearRegression(normalize=True)  # note: `normalize` was removed in scikit-learn 1.2; use a StandardScaler pipeline there
model = model.fit(train_x, train_y)
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x:x[1], reverse=True)
[('v_6', 3367064.341641952),
('v_8', 700675.5609398864),
('v_9', 170630.27723221222),
('v_7', 32322.661932025392),
('v_12', 20473.670796989394),
('v_3', 17868.07954151005),
('v_11', 11474.938996718518),
('v_13', 11261.764560017724),
('v_10', 2683.920090609242),
('gearbox', 881.8225039249613),
('fuelType', 363.90425072161565),
('bodyType', 189.60271012074494),
('city', 44.9497512052328),
('power', 28.55390161675131),
('brand_price_median', 0.5103728134078974),
('brand_price_std', 0.45036347092632434),
('brand_amount', 0.1488112039506708),
('brand_price_max', 0.0031910186703149753),
('SaleID', 5.3559899198567324e-05),
('seller', 2.4531036615371704e-06),
('train', 4.246830940246582e-07),
('offerType', -7.235445082187653e-06),
('brand_price_sum', -2.175006868187898e-05),
('name', -0.00029800127130847845),
('used_time', -0.0025158943328449923),
('brand_price_average', -0.40490484510113794),
('brand_price_min', -2.246775348688707),
('power_bin', -34.42064411726649),
('v_14', -274.7841180776088),
('kilometer', -372.897526660709),
('notRepairedDamage', -495.19038446298714),
('v_0', -2045.0549573540754),
('v_5', -11022.986240523212),
('v_4', -15121.731109858125),
('v_2', -26098.29992055678),
('v_1', -45556.189297264835)]
from matplotlib import pyplot as plt
subsample_index = np.random.randint(low=0, high=len(train_y), size=50)  # randomly sample 50 points for a visual check
plt.scatter(train_x['v_9'][subsample_index], train_y[subsample_index], color='black')
plt.scatter(train_x['v_9'][subsample_index], model.predict(train_x.loc[subsample_index]), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price is obviously different from the true price')
plt.show()
The predicted price is obviously different from the true price
![在这里插入图片描述](https://img-blog.csdnimg.cn/20200329134510981.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80Mzk1OTI0OA==,size_16,color_FFFFFF,t_70)
import seaborn as sns
print('The price clearly follows a long-tailed, roughly exponential distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.distplot(train_y)  # distplot is deprecated in newer seaborn; histplot/displot replace it
plt.subplot(1,2,2)
sns.distplot(train_y[train_y < train_y.quantile(0.9)])  # truncate the long tail
The price clearly follows a long-tailed, roughly exponential distribution
# Apply a log(x+1) transform to the label to bring it closer to a normal distribution; the +1 guards against taking log(0)
train_y_ln = np.log(train_y+1)
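NumPy also ships this transform as a built-in pair; a minimal equivalent sketch (using the same train_y as above):

# np.log1p computes log(x + 1) and np.expm1 inverts it exactly,
# with better precision near zero than the manual +1
train_y_ln2 = np.log1p(train_y)
assert np.allclose(np.expm1(train_y_ln2), train_y)  # round-trips back to price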
import seaborn as sns
print('The transformed price looks close to a normal distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.distplot(train_y_ln)
plt.subplot(1,2,2)
sns.distplot(train_y_ln[train_y_ln < train_y_ln.quantile(0.9)])  # quantile of the log-transformed label, not the raw price
The transformed price looks close to a normal distribution
model = model.fit(train_x, train_y_ln)
print('intercept:'+str(model.intercept_))
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x:x[1], reverse=True)
intercept:18.75074946557562
[('v_9', 8.052409900567515),
('v_5', 5.7642365966517515),
('v_12', 1.6182081236790782),
('v_1', 1.479831058296809),
('v_11', 1.1669016563609707),
('v_13', 0.9404711296034489),
('v_7', 0.713727308356328),
('v_3', 0.6837875771083226),
('v_0', 0.008500518010020237),
('power_bin', 0.00849796930289155),
('gearbox', 0.00792237727832305),
('fuelType', 0.006684769706828705),
('bodyType', 0.004523520092702963),
('power', 0.0007161894205359341),
('brand_price_min', 3.334351114747353e-05),
('brand_amount', 2.8978797042768103e-06),
('brand_price_median', 1.2571172872977267e-06),
('brand_price_std', 6.65917636342063e-07),
('brand_price_max', 6.194956307515807e-07),
('brand_price_average', 5.999345965093302e-07),
('SaleID', 2.1194170039646528e-08),
('seller', 9.978862181014847e-11),
('train', 7.958078640513122e-13),
('brand_price_sum', -1.5126504215909907e-10),
('offerType', -2.547437816247111e-10),
('name', -7.01551258888878e-08),
('used_time', -4.122479372354066e-06),
('city', -0.002218782481042724),
('v_14', -0.004234223418128389),
('kilometer', -0.013835866226882864),
('notRepairedDamage', -0.27027942349845646),
('v_4', -0.8315701200995309),
('v_2', -0.9470842241621843),
('v_10', -1.6261466689779176),
('v_8', -40.34300748761737),
('v_6', -238.79036385507334)]
plt.scatter(train_x['v_9'][subsample_index], train_y[subsample_index], color='black')
plt.scatter(train_x['v_9'][subsample_index], np.expm1(model.predict(train_x.loc[subsample_index])), color='blue')  # expm1 inverts the log(x+1) transform
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predictions track the true price much better after the log transform')
plt.show()
The predictions track the true price much better after the log transform
#2. Five-fold cross-validation
# Train / validation / test: hold out part of the training data as a validation set to check the parameters learned on the rest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
def log_transfer(func):
    # Wrap a metric so it is computed on log-transformed targets, making
    # the score comparable with models trained on train_y_ln
    def wrapper(y, yhat):
        result = func(np.log(y), np.nan_to_num(np.log(yhat)))
        return result
    return wrapper
scores = cross_val_score(model, X=train_x, y=train_y, verbose=1, cv=5, scoring=make_scorer(log_transfer(mean_absolute_error)))
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.7s finished
print('AVG:',np.mean(scores))
AVG: 1.3658023920313513
scores = pd.DataFrame(scores.reshape(1,-1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores
|     | cv1      | cv2     | cv3      | cv4      | cv5      |
| --- | -------- | ------- | -------- | -------- | -------- |
| MAE | 1.348304 | 1.36349 | 1.380712 | 1.378401 | 1.358105 |
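The overview also lists leave-one-out validation. It is not run in the original notebook, and on a dataset of this size it would fit one model per row, but a minimal sketch on a small subsample (assuming the train_x/train_y defined above) could look like this:

from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, make_scorer
# Leave-one-out is n-fold CV with n = number of samples, so keep n tiny
loo_x, loo_y = train_x[:200], np.log1p(train_y[:200])
loo_scores = cross_val_score(LinearRegression(), loo_x, loo_y,
                             cv=LeaveOneOut(),
                             scoring=make_scorer(mean_absolute_error))
print('LOO MAE:', loo_scores.mean())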
#3. Simulate the real business setting
# Split the data in time order: the earliest 4/5 as the training set, the most recent 1/5 as the validation set
sample_feature = sample_feature.reset_index(drop=True)
split_point = len(sample_feature)//5*4
train = sample_feature[:split_point].dropna()
val = sample_feature[split_point:].dropna()
train_x = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_x = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)
model = model.fit(train_x, train_y_ln)
print('intercept:'+str(model.intercept_))
mean_absolute_error(val_y_ln, model.predict(val_x))
intercept:17.26478651939934
0.1957766416421094
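scikit-learn packages this forward-chaining idea as TimeSeriesSplit; a sketch on the same features (assuming, as the manual split above does, that rows are already in time order):

from sklearn.model_selection import TimeSeriesSplit
# Each fold trains on an initial segment of rows and validates on the
# rows immediately after it, so no fold ever peeks at the future
ts_scores = cross_val_score(LinearRegression(), train_x, train_y_ln,
                            cv=TimeSeriesSplit(n_splits=5),
                            scoring=make_scorer(mean_absolute_error))
print('time-series CV MAE per fold:', ts_scores)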
#4. Plot the learning curve
from sklearn.model_selection import learning_curve, validation_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_size=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('score')
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_size,
        scoring=make_scorer(mean_absolute_error))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()  # background grid
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r',
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
plot_learning_curve(LinearRegression(), 'Linear_model', train_x[:1000], train_y_ln[:1000], ylim=(0.0, 0.5), cv=5, n_jobs=1)
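validation_curve, imported above but never used in the original, sweeps a single hyperparameter instead of the training-set size; a minimal sketch for Ridge's alpha (same train_x/train_y_ln subsample as the learning curve):

from sklearn.linear_model import Ridge
# Score a grid of alpha values with 5-fold CV, everything else fixed
alphas = [0.01, 0.1, 1, 10, 100]
tr_scores, val_scores = validation_curve(
    Ridge(), train_x[:1000], train_y_ln[:1000],
    param_name='alpha', param_range=alphas,
    cv=5, scoring=make_scorer(mean_absolute_error))
plt.plot(alphas, val_scores.mean(axis=1), 'o-', label='CV MAE')
plt.xscale('log'); plt.xlabel('alpha'); plt.ylabel('MAE'); plt.legend()
plt.show()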
# Model tuning
#1. Add two kinds of regularization to linear regression, giving Ridge (L2) and Lasso (L1) regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
train = sample_feature[continuous_feature_names + ['price']].dropna()
train_X = train[continuous_feature_names]
train_y = train['price']
train_y_ln = np.log(train_y + 1)
# Three models
result = {}
models = [LinearRegression(), Ridge(), Lasso()]
for model in models:
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')
result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result
LinearRegression is finished
Ridge is finished
Lasso is finished
|     | LinearRegression | Ridge    | Lasso    |
| --- | ---------------- | -------- | -------- |
| cv1 | 0.190792         | 0.194832 | 0.383899 |
| cv2 | 0.193758         | 0.197632 | 0.381894 |
| cv3 | 0.194132         | 0.198123 | 0.384090 |
| cv4 | 0.191825         | 0.195670 | 0.380526 |
| cv5 | 0.195758         | 0.199676 | 0.383611 |
# Linear regression
model = LinearRegression().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)  # seaborn >= 0.12 requires keyword arguments here
intercept:18.750750028424832
# Ridge regression
model = Ridge().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:4.671709788130855
# Lasso regression
model = Lasso().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:8.67218477988307
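What makes Lasso useful as embedded feature selection is that the L1 penalty drives many coefficients exactly to zero; a quick sketch to list which features the fit above actually kept:

# Coefficients the L1 penalty zeroed out are effectively dropped features
coef = pd.Series(model.coef_, index=continuous_feature_names)
print('kept   :', list(coef[coef != 0].index))
print('dropped:', list(coef[coef == 0].index))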
#2. Nonlinear models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
models = [LinearRegression(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          MLPRegressor(solver='lbfgs', max_iter=100),  # MLPs need scaled inputs; the features here are raw, hence the huge MAE below
          XGBRegressor(n_estimators=100, objective='reg:squarederror'),
          LGBMRegressor(n_estimators=100)]
result = dict()
for model in models:
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')
LinearRegression is finished
DecisionTreeRegressor is finished
RandomForestRegressor is finished
GradientBoostingRegressor is finished
MLPRegressor is finished
XGBRegressor is finished
LGBMRegressor is finished
result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result
|     | LinearRegression | DecisionTreeRegressor | RandomForestRegressor | GradientBoostingRegressor | MLPRegressor | XGBRegressor | LGBMRegressor |
| --- | --- | --- | --- | --- | --- | --- | --- |
| cv1 | 0.190792 | 0.198679 | 0.140822 | 0.168900 | 285.562549 | 0.142367 | 0.141542 |
| cv2 | 0.193758 | 0.193387 | 0.143273 | 0.171831 | 572.989841 | 0.140923 | 0.145501 |
| cv3 | 0.194132 | 0.189258 | 0.142621 | 0.170875 | 300.496953 | 0.139393 | 0.143887 |
| cv4 | 0.191825 | 0.190014 | 0.142087 | 0.169064 | 2114.730472 | 0.137492 | 0.142497 |
| cv5 | 0.195758 | 0.204785 | 0.144554 | 0.174094 | 353.180810 | 0.143732 | 0.144852 |
# Model tuning (LightGBM hyperparameters)
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3,5,10,15,20,40, 55]
max_depth = [3,5,10,15,20,40, 55]
bagging_fraction = []
feature_fraction = []
drop_rate = []
#1. Greedy search: tune one hyperparameter at a time, keeping the best value found so far
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)))
    best_obj[obj] = score
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0], num_leaves=leaves)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)))
    best_leaves[leaves] = score
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=min(best_leaves.items(), key=lambda x: x[1])[0],
                          max_depth=depth)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)))
    best_depth[depth] = score
sns.barplot(x=['0_initial', '1_tuning_obj', '2_tuning_leaves', '3_tuning_depth'], y=[0.143, min(best_obj.values()), min(best_leaves.values()), min(best_depth.values())])
# Grid-search tuning (exhaustive search over all 5 x 7 x 7 = 245 parameter combinations, each cross-validated)
from sklearn.model_selection import GridSearchCV
parameters = {'objective': objective , 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5)
clf = clf.fit(train_X, train_y)  # note: fit on the raw target here; using train_y_ln would be consistent with the evaluation below
clf.best_params_
{'max_depth': 15, 'num_leaves': 55, 'objective': 'regression'}
model = LGBMRegressor(objective='regression',
                      num_leaves=55,
                      max_depth=15)
np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))
0.13754980533444577
# Bayesian tuning
from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    val = cross_val_score(
        LGBMRegressor(objective='regression_l1',
                      num_leaves=int(num_leaves),
                      max_depth=int(max_depth),
                      subsample=subsample,
                      min_child_samples=int(min_child_samples)
                      ),
        X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    # BayesianOptimization maximizes its target, so return 1 - MAE
    return 1 - val
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (2, 100),
        'max_depth': (2, 100),
        'subsample': (0.1, 1),
        'min_child_samples': (2, 100)
    }
)
rf_bo.maximize()
| iter | target | max_depth | min_ch... | num_le... | subsample |
-------------------------------------------------------------------------
| 1    | 0.8625 | 98.27     | 16.21     | 46.74     | 0.5154    |
| 2    | 0.867  | 60.17     | 24.19     | 73.85     | 0.5303    |
| 3    | 0.8678 | 20.73     | 49.05     | 79.91     | 0.9991    |
| 4    | 0.8686 | 11.38     | 33.55     | 96.73     | 0.106     |
| 5    | 0.8583 | 28.24     | 88.14     | 32.07     | 0.54      |
| 6    | 0.8692 | 99.18     | 99.2      | 99.89     | 0.5816    |
| 7    | 0.8692 | 98.37     | 3.355     | 98.11     | 0.3583    |
| 8    | 0.8505 | 5.726     | 3.353     | 99.91     | 0.9506    |
| 9    | 0.8398 | 4.988     | 98.7      | 95.51     | 0.2637    |
| 10   | 0.802  | 98.82     | 96.37     | 3.977     | 0.7117    |
| 11   | 0.7719 | 6.261     | 12.23     | 2.926     | 0.9965    |
| 12   | 0.8668 | 56.81     | 23.78     | 71.71     | 0.1635    |
| 13   | 0.8684 | 99.3      | 46.5      | 86.75     | 0.1027    |
| 14   | 0.8693 | 51.32     | 77.08     | 99.54     | 0.1632    |
| 15   | 0.8678 | 17.64     | 47.26     | 78.37     | 0.5125    |
| 16   | 0.8654 | 67.56     | 99.3      | 62.61     | 0.1608    |
| 17   | 0.8694 | 48.5      | 43.38     | 99.52     | 0.1868    |
| 18   | 0.8632 | 57.29     | 61.38     | 49.45     | 0.2046    |
| 19   | 0.8666 | 95.77     | 3.698     | 71.83     | 0.5748    |
| 20   | 0.8689 | 85.61     | 76.58     | 98.76     | 0.6544    |
| 21   | 0.8692 | 70.03     | 98.23     | 99.73     | 0.3661    |
| 22   | 0.8692 | 97.84     | 27.73     | 99.84     | 0.212     |
| 23   | 0.8678 | 53.85     | 61.55     | 80.23     | 0.1136    |
| 24   | 0.8691 | 51.15     | 4.563     | 99.3      | 0.1271    |
| 25   | 0.8694 | 35.05     | 25.77     | 99.39     | 0.8209    |
| 26   | 0.8692 | 72.39     | 20.52     | 99.82     | 0.3174    |
| 27   | 0.8691 | 99.66     | 71.71     | 99.5      | 0.2219    |
| 28   | 0.8693 | 25.56     | 42.44     | 99.05     | 0.1066    |
| 29   | 0.8664 | 33.56     | 81.97     | 69.32     | 0.2377    |
| 30   | 0.868  | 87.97     | 93.77     | 85.64     | 0.1569    |
=========================================================================
1 - rf_bo.max['target']  # convert the maximized target back into an MAE
0.1305975267548991
plt.figure(figsize=(13,5))
sns.barplot(x=['0_origin', '1_log_transfer', '2_L1_&_L2', '3_change_model', '4_parameter_tuning'], y=[1.36, 0.19, 0.19, 0.14, 0.13])