xxxx18一60岁hd中国/日韩女同互慰一区二区/西西人体扒开双腿无遮挡/日韩欧美黄色一级片 - 色护士精品影院www

資源簡介

該資源屬于代碼類,用C語言和Python實現了TF-IDF算法,適用于文本分類等特征權重抽取

資源截圖

代碼片段和文件信息

# -*- coding: utf-8 -*-
import fileinput
import math
import os
from collections import Counter

TEXT = 0  # number of documents loaded so far (incremented by save_words)
SUCCEED = 0  # number of "term weight" lines written (incremented by save_weight)
Docs = []  # one list of raw term lines per loaded document

# Input and output directories, stored as cp936-encoded byte strings.
# A unicode literal is encoded directly: the earlier
# "...".decode('utf8').encode('cp936') form yields the same bytes on Python 2
# but raises AttributeError on Python 3, where str has no .decode().
# NOTE(review): the path text was recovered from a garbled listing -- confirm
# it matches the real directory names before running.
Path = u"G:\\文本聚類數據集\\數據集".encode("cp936")
w_Path = u"G:\\文本聚類數據集\\特征數據集".encode("cp936")

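## Weight computation: tf is the number of times a term occurs in a document,
## df is the number of documents containing the term, and max is the count of
## the most frequent term in the document.
## Returns the TF-IDF weight.

## Sort the feature vector by weight.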
def sort(terms, TF_IDF):
    """Reorder ``terms`` and ``TF_IDF`` in place by descending weight.

    The two lists are parallel: ``TF_IDF[i]`` is the weight of ``terms[i]``.
    Both are permuted together so that the pairing is preserved and the
    heaviest term comes first.

    Args:
        terms: List of terms; mutated in place.
        TF_IDF: List of weights, at least as long as ``terms``; the first
            ``len(terms)`` entries are mutated in place.

    Returns:
        None. The result is delivered through the mutated arguments.

    Raises:
        IndexError: If ``TF_IDF`` is shorter than ``terms``.
    """
    # Sort the index permutation rather than the values so both lists can be
    # rearranged identically. sorted() is stable even with reverse=True, so
    # equal weights keep their original relative order.
    order = sorted(range(len(terms)), key=lambda idx: TF_IDF[idx], reverse=True)
    # Slice assignment keeps the caller's list objects, as an in-place sort must.
    terms[:] = [terms[idx] for idx in order]
    TF_IDF[:len(order)] = [TF_IDF[idx] for idx in order]

def save_words(path):
    """Load one document's lines and register it in the module-level corpus.

    Every line of the file at ``path`` is kept verbatim (line endings
    included) as one term. The resulting list is appended to the global
    ``Docs`` and the global document counter ``TEXT`` is incremented.

    Args:
        path: Path of the document file to read.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    global Docs, TEXT
    # The context manager closes the file even if reading fails part-way.
    with open(path, "r") as fp:
        terms = fp.readlines()
    Docs.append(terms)
    TEXT += 1

def GenerateIDF(path):
    """Compute an IDF value for each distinct line (term) of one document.

    The file at ``path`` is read line by line; each distinct line, with its
    line ending intact, is one term, kept in order of first appearance. For
    every term the document frequency ``df`` is the number of entries in the
    global ``Docs`` that contain it, and the IDF is
    ``log(TEXT / df + 0.01)`` with ``TEXT`` the global document count.

    Args:
        path: Path of the document file to read.

    Returns:
        A tuple ``(IDF, terms)`` of two parallel lists: the IDF values and
        the distinct terms they belong to.

    Raises:
        ZeroDivisionError: If a term occurs in no entry of ``Docs`` (the
            document was never registered through ``save_words``).
        IOError/OSError: If the file cannot be opened or read.
    """
    terms = []
    seen = set()
    with open(path, "r") as fp:
        for line in fp:
            # The set gives O(1) duplicate detection; the list keeps order.
            if line not in seen:
                seen.add(line)
                terms.append(line)

    # Build each document's vocabulary once so the membership test for every
    # term is a set lookup instead of a scan over the whole document.
    doc_vocabularies = [set(doc) for doc in Docs]

    IDF = []
    for term in terms:
        df = sum(1 for vocabulary in doc_vocabularies if term in vocabulary)
        IDF.append(math.log(float(TEXT) / float(df) + 0.01))
    return IDF, terms

def GenerateTF(path, terms):
    """Compute a term frequency for each entry of ``terms`` in one document.

    The file at ``path`` is read line by line, each line (line ending
    included) counting as one occurrence of a term. The value for a term is
    its occurrence count divided by ``len(terms)``.

    NOTE(review): the divisor is the number of entries in ``terms``, not the
    total number of lines in the document -- confirm this is the intended
    normalization.

    Args:
        path: Path of the document file to read.
        terms: Terms to look up; each must match a file line exactly,
            including its line ending.

    Returns:
        A list of floats parallel to ``terms``; terms that do not occur in
        the file get 0.0. An empty ``terms`` yields an empty list.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # Count every line in a single pass; a Counter returns 0 for missing keys.
    with open(path, "r") as fp:
        occurrences = Counter(fp)
    terms_count = len(terms)
    return [float(occurrences[term]) / float(terms_count) for term in terms]

def save_weight(TF, IDF, terms, path, top=200):
    """Write the highest-weighted terms of one document to ``path``.

    The weight of each term is ``TF[i] * IDF[i]``. Terms are ordered by
    descending weight through ``sort`` (which reorders the caller's ``terms``
    list in place), and at most ``top`` of them are written, one per line, as
    ``"<term> <weight>\\n"`` with surrounding whitespace stripped from the
    term. The global ``SUCCEED`` is incremented once per line written.

    Args:
        TF: Term frequencies, parallel to ``terms``.
        IDF: Inverse document frequencies, parallel to ``terms``.
        terms: Terms being weighted; reordered in place.
        path: Output file path; the file is created or truncated.
        top: Maximum number of terms to write (default 200).

    Raises:
        IOError/OSError: If the output file cannot be opened or written.
    """
    global SUCCEED
    TF_IDF = [float(tf) * float(idf) for tf, idf in zip(TF, IDF)]
    sort(terms, TF_IDF)
    # Never try to write more entries than exist, and treat a negative
    # ``top`` as "write nothing".
    limit = max(0, min(top, len(TF_IDF)))
    with open(path, "w+") as fp:
        for term, weight in zip(terms[:limit], TF_IDF[:limit]):
            fp.write(term.strip() + " " + str(weight) + "\n")
            SUCCEED += 1

def?read_dir(pathw_path):
????global?SUCCEED
????file_list?=?[]
????files?=?os.listdir(path)
????print?“please?wait......“
????for?f?in?files:
????????file_list.append(f)
????????r_name?=?path?+?‘\\‘?+?f
????????save_words(r_name)
????print?“sum?of?docs?is:%d“%TEXT
????for?i?in?file_list:
????????print?i
????????name?=?path?+?‘\\‘?+?i
????????w_name?=?w_path?+?‘\\‘?+?i
????????IDFterms

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----

     文件       3457  2015-10-22 22:52  TFIDFMeasure.py

     文件       3455  2015-05-25 19:40  DFTF.CPP

----------- ---------  ---------- -----  ----

                 6912                    2


評論

共有 條評論