决策树完结篇 (二)

2014-11-23 23:55:31 · 作者: · 浏览: 13
et] uniqueva ls = set(featValues) for value in uniqueva ls: subLabels = labels[:] myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels) return myTree def classify(inputTree,featLabels,testVec): firstStr = list(inputTree.keys())[0] secondDict = inputTree[firstStr] featIndex = featLabels.index(firstStr) for key in secondDict.keys(): if testVec[featIndex] == key: if type(secondDict[key]).__name__=='dict': classLabel = classify(secondDict[key],featLabels,testVec) else: classLabel = secondDict[key] return classLabel myDat,labels = CreateDataSet() print(calcShannonEnt(myDat)) print(splitDataSet(myDat, 1, 1)) print(chooseBestFeatureToSplit(myDat)) myTree = createTree(myDat, labels) print(classify(myTree, labels, [1, 0])) print(classify(myTree, labels, [1, 1])) import math import operator def calcShannonEnt(dataset): numEntries = len(dataset) labelCounts = {} for featVec in dataset: currentLabel = featVec[-1] if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0 labelCounts[currentLabel] +=1 shannonEnt = 0.0 for key in labelCounts: prob = float(labelCounts[key])/numEntries shannonEnt -= prob*math.log(prob, 2) return shannonEnt def CreateDataSet(): dataset = [[1, 1, 'yes' ], [1, 1, 'yes' ], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] labels = ['no surfacing', 'flippers'] return dataset, labels def splitDataSet(dataSet, axis, value): retDataSet = [] for featVec in dataSet: if featVec[axis] == value: reducedFeatVec = featVec[:axis] reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) return retDataSet def chooseBestFeatureToSplit(dataSet): numberFeatures = len(dataSet[0])-1 baseEntropy = calcShannonEnt(dataSet) bestInfoGain = 0.0; bestFeature = -1; for i in range(numberFeatures): featList = [example[i] for example in dataSet] print(featList) uniqueva ls = set(featList) print(uniqueva ls) newEntropy =0.0 for value in uniqueva ls: subDataSet = splitDataSet(dataSet, i, value) prob = len(subDataSet)/float(len(dataSet)) newEntropy += prob * calcShannonEnt(subDataSet) infoGain = baseEntropy - newEntropy if(infoGain >
bestInfoGain): bestInfoGain = infoGain bestFeature = i return bestFeature def majorityCnt(classList): classCount ={} for vote in classList: if vote not in classCount.keys(): classCount[vote]=0 classCount[vote]=1 sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) return sortedClassCount[0][0] def createTree(dataSet, inputlabels): labels=inputlabels[:] classList = [example[-1] for example in dataSet] if classList.count(classList[0])==len(classList): return classList[0] if len(dataSet[0])==1: return majorityCnt(classList) bestFeat = chooseBestFeatureToSplit(dataSet) bestFeatLabel = labels[bestFeat] myTree = {bestFeatLabel:{}} del(labels[bestFeat]) featValues = [example[bestFeat] for example in dataSet] uniqueva ls = set(featValues) for value in uniqueva ls: subLabels = labels[:] myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels) return myTree def classify(inputTr