终于看完了决策树生成,和测试的代码,感觉还是非常有收获的,于是总结下决策树相关的东西,决策树说白了就是利用事物已知属性来构建对事物进行判定,划分数据的方式在前面的文章中已经进行了介绍,这里就不多说了,因为前面都没有给出如何利用自己构建的决策树来对新添加的数据进行测试,所以下面给出决策代码:
def classify(inputTree,featLabels,testVec):
firstStr = list(inputTree.keys())[0]
secondDict = inputTree[firstStr]
featIndex = featLabels.index(firstStr)
for key in secondDict.keys():
if testVec[featIndex] == key:
if type(secondDict[key]).__name__=='dict':
classLabel = classify(secondDict[key],featLabels,testVec)
else:
classLabel = secondDict[key]
return classLabel
def classify(inputTree,featLabels,testVec):
firstStr = list(inputTree.keys())[0]
secondDict = inputTree[firstStr]
featIndex = featLabels.index(firstStr)
for key in secondDict.keys():
if testVec[featIndex] == key:
if type(secondDict[key]).__name__=='dict':
classLabel = classify(secondDict[key],featLabels,testVec)
else:
classLabel = secondDict[key]
return classLabel
吼吼,这个点单的测试代码就是完成对给定数据进行分类决策的。其实就是对整棵树进行遍历,直到到达叶子节点。 同样给出程序的运行截图: 当然为了保险起见:我还是给出全部的源码,方便没有看前几篇的童鞋直接对其运行,操作和修改成自己的代码。 import math import operator def calcShannonEnt(dataset): numEntries = len(dataset) labelCounts = {} for featVec in dataset: currentLabel = featVec[-1] if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0 labelCounts[currentLabel] +=1 shannonEnt = 0.0 for key in labelCounts: prob = float(labelCounts[key])/numEntries shannonEnt -= prob*math.log(prob, 2) return shannonEnt def CreateDataSet(): dataset = [[1, 1, 'yes' ], [1, 1, 'yes' ], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] labels = ['no surfacing', 'flippers'] return dataset, labels def splitDataSet(dataSet, axis, value): retDataSet = [] for featVec in dataSet: if featVec[axis] == value: reducedFeatVec = featVec[:axis] reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) return retDataSet def chooseBestFeatureToSplit(dataSet): numberFeatures = len(dataSet[0])-1 baseEntropy = calcShannonEnt(dataSet) bestInfoGain = 0.0; bestFeature = -1; for i in range(numberFeatures): featList = [example[i] for example in dataSet] print(featList) uniqueva ls = set(featList) print(uniqueva ls) newEntropy =0.0 for value in uniqueva ls: subDataSet = splitDataSet(dataSet, i, value) prob = len(subDataSet)/float(len(dataSet)) newEntropy += prob * calcShannonEnt(subDataSet) infoGain = baseEntropy - newEntropy if(infoGain >bestInfoGain): bestInfoGain = infoGain bestFeature = i return bestFeature def majorityCnt(classList): classCount ={} for vote in classList: if vote not in classCount.keys(): classCount[vote]=0 classCount[vote]=1 sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) return sortedClassCount[0][0] def createTree(dataSet, inputlabels): labels=inputlabels[:] classList = [example[-1] for example in dataSet] if classList.count(classList[0])==len(classList): return classList[0] if len(dataSet[0])==1: return majorityCnt(classList) bestFeat = chooseBestFeatureToSplit(dataSet) bestFeatLabel = labels[bestFeat] myTree = {bestFeatLabel:{}} del(labels[bestFeat]) featValues = [example[bestFeat] for example in dataS