資源簡介
2017年3月16號關于決策樹的資源上傳錯了,這一份才是決策樹的Python代碼實現,包含詳細的中文注釋,歡迎下載學習。Python版本為2.7.
代碼片段和文件信息
# -*- coding: utf-8 -*-
import operator
from collections import Counter
from math import log
def createData():
    """Create the small sample data set used to exercise the decision tree.

    Returns:
        A ``(dataSet, label)`` tuple. Each row of ``dataSet`` holds two
        0/1 feature values followed by the class label (``'yes'`` or
        ``'no'``); ``label`` lists the names of the two feature columns,
        in column order.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    label = ['no surfacing', 'flippers']
    return dataSet, label
def calcshan(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in a data set.

    The entropy measures how mixed the class labels are; lower values mean
    the labels are more uniform.

    Args:
        dataSet: Sequence of rows whose last element is the class label.

    Returns:
        The entropy as a float. An empty data set yields ``0.0``.
    """
    lenDataSet = len(dataSet)
    # Tally how many rows carry each class label (last column of the row).
    labelCounts = Counter(data[-1] for data in dataSet)
    H = 0.0
    for count in labelCounts.values():
        px = float(count) / float(lenDataSet)  # probability of this label
        H -= px * log(px, 2)
    return H
????????
????????????
‘‘‘根據(jù)某一特征分類數(shù)據(jù)集‘‘‘
def?spiltData(dataSetaxisvalue):????#dataSet為要劃分的數(shù)據(jù)集axis為給定的特征,value為給定特征的具體值
????subDataSet=[]
????for?data?in?dataSet:
????????subData=[]
????????if?data[axis]==value:
????????????subData=data[:axis]??#取出data中第0到axis-1個數(shù)進(jìn)subData;
????????????subData.extend(data[axis+1:])??#取出data中第axis+1到最后一個數(shù)進(jìn)subData;這兩行代碼相當(dāng)于把第axis個數(shù)從數(shù)據(jù)集中剔除掉
????????????subDataSet.append(subData)?#此處要注意expend和append的區(qū)別
????return?subDataSet
def chooseBestFeature(dataSet):
    """Pick the feature whose split yields the largest information gain.

    For every feature column the data set is partitioned by each distinct
    value of that feature; the entropies of the partitions, weighted by
    partition size, are summed and subtracted from the entropy of the
    whole data set. The feature with the largest such difference wins.

    Args:
        dataSet: Non-empty sequence of rows; the last element of each row
            is the class label and is not considered a feature.

    Returns:
        The column index of the best feature. Returns ``0`` when no
        feature gives a gain strictly greater than zero.

    Raises:
        IndexError: If ``dataSet`` is empty (the first row is inspected to
            determine the number of features).
    """
    lenFeature = len(dataSet[0]) - 1  # exclude the class-label column
    shanInit = calcshan(dataSet)      # entropy of the unsplit data set
    total = float(len(dataSet))
    inValue = 0.0                     # best gain seen so far
    bestFeature = 0
    for i in range(lenFeature):
        shanCarry = 0.0
        # Distinct values taken by feature i, e.g. 0 and 1.
        feature = set(example[i] for example in dataSet)
        for feat in feature:
            subData = spiltData(dataSet, i, feat)
            prob = len(subData) / total
            shanCarry += prob * calcshan(subData)
        outValue = shanInit - shanCarry  # gain obtained by splitting on i
        if outValue > inValue:
            inValue = outValue
            bestFeature = i
    return bestFeature
??
'''如果數據集已經處理了所有屬性,但是類標簽依然不是唯一時使用,采用多數表決的方法定義該節點的分類'''
def?majorCount(classList):
????classCount={}
????for?vote?in?classList:
????????if?vote?not?in?classCount.keys():??#若字典中不存在該類別標(biāo)簽,即創(chuàng)建
????????????classCount[vote]=0
????????classCount[vote]+=1????????????????#遞增類別標(biāo)簽的值
????sortedClassCount=sorted(classCount.iterit
評論
共有 條評論