1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
|
import operator
import pickle
from collections import Counter
from math import log

from tree_plotter import create_plot
def create_data_set():
    """Build the toy fish-classification data set.

    Conventions used throughout this module:
    1. every sample is a list, and all samples have the same length;
    2. the last element of each sample is its class label.

    :return: tuple of (data_set, feature label names)
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    feature_names = ['no surfacing', 'flippers']
    return samples, feature_names
def calc_shannon_entropy(data_set):
    """Compute the Shannon entropy of the class labels in *data_set*.

    The class label is taken from the last element of each sample.

    :param data_set: list of samples; each sample's last element is its label
    :return: entropy in bits (float)
    """
    total = len(data_set)
    tally = {}
    for sample in data_set:
        label = sample[-1]
        # dict.get with a default replaces the explicit key-membership check.
        tally[label] = tally.get(label, 0) + 1
    entropy = 0.0
    for count in tally.values():
        probability = count / total
        entropy -= probability * log(probability, 2)
    return entropy
def calc_shannon_entropy2(data_set):
    """Shannon entropy of the last-column class labels, via Counter.

    Same result as calc_shannon_entropy, implemented with a generator
    expression feeding collections.Counter.

    :param data_set: list of samples; each sample's last element is its label
    :return: entropy in bits (float)
    """
    total = len(data_set)
    frequencies = Counter(sample[-1] for sample in data_set)
    entropy = 0.0
    for occurrences in frequencies.values():
        probability = occurrences / total
        entropy -= probability * log(probability, 2)
    return entropy
def split_data_set(data_set, axis, value):
    """Select the samples whose feature at *axis* equals *value*.

    The matched feature column is removed from each returned sample
    (slice-before + slice-after concatenation), and brand-new lists are
    built, so the input data set is never modified.

    :param data_set: data set to partition
    :param axis: index of the feature to filter on
    :param value: feature value that selects the returned subset
    :return: new list of reduced feature vectors
    """
    return [
        sample[:axis] + sample[axis + 1:]
        for sample in data_set
        if sample[axis] == value
    ]
def choose_best_feature2split(data_set):
    """Pick the feature whose split yields the highest information gain.

    For every feature column (all but the final label column), partition the
    data set on each distinct value, compute the weighted entropy of the
    partitions, and keep the feature with the largest entropy reduction.

    :param data_set: samples whose last element is the class label
    :return: index of the best feature, or -1 if no split strictly improves
             on the base entropy
    """
    feature_count = len(data_set[0]) - 1
    base_entropy = calc_shannon_entropy2(data_set)
    best_gain, best_index = 0.0, -1
    for index in range(feature_count):
        distinct_values = {sample[index] for sample in data_set}
        split_entropy = 0.0
        for candidate in distinct_values:
            subset = split_data_set(data_set, index, candidate)
            weight = len(subset) / len(data_set)
            split_entropy += weight * calc_shannon_entropy2(subset)
        gain = base_entropy - split_entropy
        # Strict '>' keeps the earliest feature on ties, as before.
        if gain > best_gain:
            best_gain, best_index = gain, index
    return best_index
def majority_counter(class_list):
    """Return the majority class label by vote.

    Used when all features have been consumed but the remaining samples
    still carry mixed labels: the leaf node is assigned the most common
    class among them.

    :param class_list: non-empty list of class labels
    :return: the most frequent label; ties resolve to the label that was
             encountered first (same as the original stable reverse sort
             over a hand-built tally dict)
    """
    # Counter.most_common sorts by count descending with a stable sort over
    # insertion order, so it reproduces the original
    # sorted(class_count.items(), key=itemgetter(1), reverse=True)[0][0].
    return Counter(class_list).most_common(1)[0][0]
def classify(input_tree, feature_labels, test_vector):
    """Classify *test_vector* by walking a decision tree built by create_tree.

    :param input_tree: nested dict of the form
                       {feature label: {feature value: subtree-or-class}}
    :param feature_labels: feature names, used to map a tree node's label
                           to a position in test_vector
    :param test_vector: feature values of the instance to classify
    :return: the predicted class label, or None when the vector's value for
             a node's feature has no matching branch in the tree
    """
    root_label = list(input_tree.keys())[0]
    branches = input_tree[root_label]
    feature_index = feature_labels.index(root_label)
    class_label = None
    for branch_value in branches.keys():
        if test_vector[feature_index] == branch_value:
            subtree = branches[branch_value]
            # isinstance replaces the fragile string comparison
            # type(x).__name__ == 'dict' (an anti-pattern; also fails for
            # dict subclasses).
            if isinstance(subtree, dict):
                class_label = classify(subtree, feature_labels, test_vector)
            else:
                class_label = subtree
    return class_label
def store_tree(input_tree, filename):
    """Serialize a decision tree to *filename* using pickle.

    Protocol 0 (the ASCII protocol) is kept for compatibility with the
    original on-disk format; the file is still opened in binary mode as
    pickle requires.

    :param input_tree: nested-dict decision tree to persist
    :param filename: destination path
    """
    with open(filename, 'wb') as sink:
        pickle.dump(input_tree, sink, protocol=0)
def grab_tree(filename):
    """Load a pickled decision tree previously written by store_tree.

    NOTE(review): pickle.load can execute arbitrary code — only read files
    from trusted sources.

    :param filename: path of the pickle file
    :return: the deserialized tree object
    """
    with open(filename, 'rb') as source:
        tree = pickle.load(source)
    return tree
def create_tree(data_set, labels):
    """Recursively build an ID3 decision tree.

    Recursion stops when every remaining sample shares one class label, or
    when all features are exhausted (the leaf is then decided by majority
    vote via majority_counter).

    :param data_set: samples whose last element is the class label
    :param labels: feature names aligned with the feature columns
    :return: a nested dict {feature label: {value: subtree-or-class}}, or a
             bare class label for a leaf
    """
    class_list = [sample[-1] for sample in data_set]
    if class_list.count(class_list[0]) == len(class_list):
        # Pure node: every remaining sample agrees on the label.
        return class_list[0]
    if len(data_set[0]) == 1:
        # Only the label column is left: features exhausted, majority vote.
        return majority_counter(class_list)
    best_feature = choose_best_feature2split(data_set)
    best_feature_label = labels[best_feature]
    # Build a reduced copy instead of `del labels[best_feature]`: the
    # original in-place delete mutated the CALLER's label list, leaving it
    # corrupted for any later classify() call that needs the full labels.
    remaining_labels = labels[:best_feature] + labels[best_feature + 1:]
    my_tree = {best_feature_label: {}}
    feature_values = {sample[best_feature] for sample in data_set}
    for value in feature_values:
        my_tree[best_feature_label][value] = create_tree(
            split_data_set(data_set, best_feature, value), remaining_labels[:]
        )
    return my_tree
if __name__ == '__main__':
    # Demo: build and plot a decision tree for the contact-lenses data set.
    with open('resource/lenses.txt') as lens_file:
        lenses = [line.strip().split('\t') for line in lens_file]
    lenses_labels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lenses_tree = create_tree(lenses, lenses_labels)
    print(lenses)
    print(lenses_labels)
    print(lenses_tree)
    create_plot(lenses_tree)
|