Zhou Zhihua's Machine Learning (Watermelon Book), Beginner's Python Study Notes (3): Chapter 4 Decision Trees, Python Code with Pre-Pruning


Based on the watermelon dataset 2.0 (extraction code: esa8), we build a decision tree using information gain as the attribute-selection criterion. The steps are as follows.

First, define the value sets of the discrete attributes and the class labels, and read in the data:

```python
import math
import pandas as pd

# Value sets of the discrete attributes
D_keys = {
    '色泽': ['青绿', '乌黑', '浅白'],
    '根蒂': ['蜷缩', '硬挺', '稍蜷'],
    '敲声': ['清脆', '沉闷', '浊响'],
    '纹理': ['稍糊', '模糊', '清晰'],
    '脐部': ['凹陷', '稍凹', '平坦'],
    '触感': ['软粘', '硬滑'],
}
# Class labels
keys = ['是', '否']
```
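The main function at the end of this post calls a loadData helper that the original post does not show. A minimal sketch, assuming the dataset is a comma-separated text file with a header row (the separator may need adjusting for your copy of the file):

```python
# Hypothetical helper, not shown in the original post: load the watermelon
# dataset, assuming a comma-separated text file with a header row.
def loadData(filename):
    return pd.read_csv(filename, sep=',', encoding='utf-8')
```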

Using the hold-out method, split the data into mutually exclusive training and test sets at a 7:3 ratio. To keep the proportions of good and bad melons the same in both sets, use stratified sampling, i.e. sample separately from the good-melon subset and the bad-melon subset:

```python
# Hold-out split into training and test sets (7:3) with stratified sampling
def traintest(dataSet):
    dataSet0 = dataSet[dataSet['好瓜'] == '是']   # good melons
    dataSet1 = dataSet[dataSet['好瓜'] == '否']   # bad melons
    list0 = dataSet0.sample(frac=0.7)
    list0 = pd.concat([list0, dataSet1.sample(frac=0.7)])
    list1 = dataSet.drop(list0.index, axis=0)
    return list0, list1
```
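Note that sample draws a different random split on every run, so the trained tree and its accuracy will vary between runs. If you want a reproducible split, sample accepts a standard random_state argument, e.g.:

```python
# Reproducible variant of the split: fix the random seed of both strata
list0 = dataSet0.sample(frac=0.7, random_state=42)
list0 = pd.concat([list0, dataSet1.sample(frac=0.7, random_state=42)])
```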

Next, write the helper functions used when growing the tree:

```python
# A leaf node is labelled with the majority class in D
def choose_largest_example(D):
    count = D['好瓜'].value_counts()
    return '是' if count.get('是', 0) > count.get('否', 0) else '否'

# Accuracy of a (sub)tree on the test set
def test_Tree(Tree, data_test):
    accuracy = 0
    for index, row in data_test.iterrows():
        result = dfs_Tree(Tree, row)
        if result == row['好瓜']:
            accuracy += 1
    return accuracy / data_test.shape[0]

# Do all samples in D take the same value on every attribute in A?
def same_value(D, A):
    for key in A:
        if key in D_keys and len(D[key].value_counts()) > 1:
            return False
    return True

# Weighted entropy of a dataset ('power' holds the sample weights)
def calc_Ent(dataSet):
    numEntries = dataSet['power'].sum()
    Count = dataSet.groupby('好瓜')['power'].sum()
    Ent = 0.0
    for key in keys:
        if key in Count:
            prob = Count[key] / numEntries
            Ent -= prob * math.log(prob, 2)
    return Ent

# Information gain of splitting on discrete attribute `key`
def calc_Gain_D(D, D_no_nan, key, Ent_D):
    Ent = 0.0
    D_size = D['power'].sum()
    D_nan_size = D_no_nan['power'].sum()
    for value in D_keys[key]:
        Dv = D.loc[D[key] == value]
        Dv_size = Dv['power'].sum()
        Ent_Dv = calc_Ent(Dv)
        Ent += Dv_size / D_nan_size * Ent_Dv
    return D_nan_size / D_size * (Ent_D - Ent)

# Candidate split points T for a continuous attribute (means of groups of n values)
def candidate_T(D, key, n):
    L = set(D[key])
    T = []
    a, Sum = 0, 0
    for value in L:
        Sum += value
        a += 1
        if a == n:
            T.append(Sum / n)
            a, Sum = 0, 0
    if a > 0:
        T.append(Sum / a)
    return T

# Information gain of bi-partitioning continuous attribute `key` at split point t
def calc_Gain_t(D, D_no_nan, key, t, Ent_D):
    Ent = 0.0
    D_size = D['power'].sum()
    D_nan_size = D_no_nan['power'].sum()
    for Dv in (D.loc[D[key] <= t], D.loc[D[key] > t]):
        Dv_size = Dv['power'].sum()
        Ent_Dv = calc_Ent(Dv)
        Ent += Dv_size / D_nan_size * Ent_Dv
    return D_nan_size / D_size * (Ent_D - Ent)

# Try every candidate split point and keep the one with the largest gain
def calc_Gain_C(D, D_no_nan, key, Ent_D):
    n = 2
    T = candidate_T(D, key, n)
    max_Gain, max_partition = -1, -1
    for t in T:
        Gain = calc_Gain_t(D, D_no_nan, key, t, Ent_D)
        if max_Gain < Gain:
            max_Gain = Gain
            max_partition = t
    return max_Gain, max_partition

# Choose the best splitting attribute in A; for a continuous attribute,
# also return the best split point
def choose_best_attribute(D, A):
    max_Gain, max_partition, partition, best_attr = -1, -1, -1, ''
    for key in A:
        D_no_nan = D.loc[pd.notna(D[key])]
        Ent_D = calc_Ent(D_no_nan)
        if key in D_keys:   # discrete attribute
            Gain = calc_Gain_D(D, D_no_nan, key, Ent_D)
        else:               # continuous attribute
            Gain, partition = calc_Gain_C(D, D_no_nan, key, Ent_D)
        if max_Gain < Gain:
            best_attr = key
            max_Gain = Gain
            max_partition = partition
    return best_attr, max_partition
```
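For reference, calc_Ent and the two gain functions implement the entropy and the missing-value-weighted information gain from Chapter 4 of the book, with the 'power' column carrying the sample weights:

$$
\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k,
\qquad
\mathrm{Gain}(D, a) = \rho \times \Bigl( \mathrm{Ent}(\tilde{D}) - \sum_{v=1}^{V} \tilde{r}_v \,\mathrm{Ent}(\tilde{D}^v) \Bigr)
$$

Here $\tilde{D}$ is the subset of $D$ with no missing value on attribute $a$, $\rho$ is the weighted fraction of $D$ that $\tilde{D}$ accounts for (the factor D_nan_size / D_size in the code), and $\tilde{r}_v$ is the weighted fraction of $\tilde{D}$ taking value $a^v$.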

Now grow the tree, applying pre-pruning as we go:

```python
# TreeGenerate grows the decision tree recursively. Recursion returns when:
# 1. all samples in the current node belong to one class;
# 2. the attribute set is empty, or all samples take the same value on every attribute;
# 3. the sample set of the current node is empty.
def TreeGenerate(D, A):
    Count = D['好瓜'].value_counts()
    if len(Count) == 1:
        return D['好瓜'].values[0]
    if len(A) == 0 or same_value(D, A):
        return choose_largest_example(D)

    node = {}
    best_attr, partition = choose_best_attribute(D, A)
    D_size = D.shape[0]
    # best splitting attribute is discrete
    if best_attr in D_keys:
        for value in D_keys[best_attr]:
            Dv = D.loc[D[best_attr] == value].copy()
            Dv_size = Dv.shape[0]
            # samples with a missing value enter the branch with a reduced weight
            Dv.loc[pd.isna(Dv[best_attr]), 'power'] = Dv_size / D_size
            temp1 = test_Tree(choose_largest_example(D), data_test)
            if Dv.shape[0] == 0:
                node[value] = choose_largest_example(D)
            else:
                new_A = [key for key in A if key != best_attr]
                node[value] = TreeGenerate(Dv, new_A)
                # pre-pruning: if a majority-class leaf scores strictly better
                # on the test set than the grown subtree, keep the leaf instead
                temp0 = test_Tree(node[value], data_test)
                if temp1 > temp0:
                    node[value] = choose_largest_example(D)
    # best splitting attribute is continuous
    else:
        left = D.loc[D[best_attr] <= partition].copy()
        Dv_size = left.shape[0]
        left.loc[pd.isna(left[best_attr]), 'power'] = Dv_size / D_size
        left_key = '<= ' + str(partition)
        temp1 = test_Tree(choose_largest_example(D), data_test)
        if left.shape[0] == 0:
            node[left_key] = choose_largest_example(D)
        else:
            node[left_key] = TreeGenerate(left, A)
            temp0 = test_Tree(node[left_key], data_test)
            if temp1 > temp0:
                node[left_key] = choose_largest_example(D)

        right = D.loc[D[best_attr] > partition].copy()
        Dv_size = right.shape[0]
        right.loc[pd.isna(right[best_attr]), 'power'] = Dv_size / D_size
        right_key = '> ' + str(partition)
        if right.shape[0] == 0:
            node[right_key] = choose_largest_example(D)
        else:
            node[right_key] = TreeGenerate(right, A)
            temp0 = test_Tree(node[right_key], data_test)
            if temp1 > temp0:
                node[right_key] = choose_largest_example(D)

    # plotTree.plotTree(Tree)   # optional visualization (plotTree module not shown in this post)
    return {best_attr: node}

# Descend one level: pick the branch of Tree matching `value` on attribute `key`
def get_next_Tree(Tree, key, value):
    if key not in D_keys:   # continuous attribute: branch keys are '<= t' and '> t'
        partition = float(list(Tree[key].keys())[0].split(' ')[1])
        if value <= partition:
            value = '<= ' + str(partition)
        else:
            value = '> ' + str(partition)
    return Tree[key][value]
```
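The returned tree is a nested dict: each internal node maps an attribute name to a dict of branches, and each leaf is a class label. A hypothetical example of the shape only (the actual attributes and labels depend on the random split):

```python
# Hypothetical shape of a returned tree; values vary from run to run
Tree = {'纹理': {'清晰': {'触感': {'软粘': '否', '硬滑': '是'}},
                 '稍糊': '否',
                 '模糊': '否'}}
```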

Use the tree to classify a sample:

```python
# Depth-first traversal of the tree to predict the label of one sample
def dfs_Tree(Tree, row):
    if isinstance(Tree, dict):
        key = list(Tree.keys())[0]
        value = row[key]
        if pd.isnull(value):
            # missing attribute value: descend every branch and take a majority vote
            result = {label: 0 for label in keys}
            for next_key in Tree[key]:
                next_Tree = Tree[key][next_key]
                temp = dfs_Tree(next_Tree, row)
                result[temp] += 1
            return '是' if result['是'] > result['否'] else '否'
        else:
            next_Tree = get_next_Tree(Tree, key, value)
            return dfs_Tree(next_Tree, row)
    else:
        return Tree
```
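For example, classifying a single held-out sample (assuming Tree and data_test from the main function below):

```python
row = data_test.iloc[0]     # one test sample as a pandas Series
print(dfs_Tree(Tree, row))  # prints '是' or '否'
```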

Finally, the main function:

```python
if __name__ == '__main__':
    # read the data
    filename = '/Users/haoranjiang/Documents/machine learning/111111111/tree.txt'
    dataSet = loadData(filename)
    dataSet.drop(columns=['编号'], inplace=True)
    # per-sample weights, used when handling missing values
    dataSet['power'] = 1.0
    data_train, data_test = traintest(dataSet)
    # train the decision tree
    A = [column for column in data_train.columns if column != '好瓜']
    Tree = TreeGenerate(data_train, A)
    # evaluate the decision tree
    print('Accuracy:', test_Tree(Tree, data_test) * 100, '%')
    print(Tree)
```
Author: River_J777



