-
大小: 3KB 文件類型: .rar 金幣: 2 下載: 0 次 發布日期: 2021-06-03
- 語言: C/C++
- 標簽: 文本分類 特征權重 TF-IDF
資源簡介
該資源屬于代碼類,用C語言和Python實現了TF-IDF算法,適用于文本分類等特征權重抽取。

代碼片段和文件信息
# -*- coding: utf-8 -*-
import math
import os
import fileinput

# Module-level state shared by the functions below.
TEXT = 0  # number of documents registered so far (incremented by save_words)
SUCCEED = 0  # number of weight lines written so far (incremented by save_weight)
Docs = []  # one list of raw lines per registered document

# Input and output directories as cp936 (GBK) encoded byte strings.
# A unicode literal is encoded directly instead of decoding a UTF-8 byte
# literal first; the resulting bytes are the same, and the expression also
# evaluates on interpreters where str has no .decode().
# NOTE(review): the directory names were restored from a garbled copy of the
# script -- confirm they match the real folders on disk.
Path = u"G:\\文本聚类数据集\\数据集".encode("cp936")
w_Path = u"G:\\文本聚类数据集\\特征数据集".encode("cp936")
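## Weight function: tf is the number of times a term occurs in a document,
## df is the number of documents containing the term, and max is the count of
## the most frequent term in the document.
## Returns the TF-IDF weight.
## NOTE(review): the function these two comments describe does not appear below.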
## Sort the feature vector by weight.
def sort(terms, TF_IDF):
    """Reorder ``terms`` and ``TF_IDF`` in place by descending weight.

    Both lists are permuted identically, so ``terms[i]`` keeps its weight
    ``TF_IDF[i]``. The ordering is stable: entries with equal weight keep
    their original relative order.

    Args:
        terms: list of terms; mutated in place.
        TF_IDF: list of weights parallel to ``terms``; mutated in place.
            Entries beyond ``len(terms)`` are left untouched.

    Returns:
        None. The result is delivered through the mutated arguments.

    Raises:
        IndexError: if ``TF_IDF`` is shorter than ``terms``.
    """
    # Compute the permutation once, then apply it to both lists.
    order = sorted(range(len(terms)), key=lambda i: TF_IDF[i], reverse=True)
    sorted_terms = [terms[i] for i in order]
    sorted_weights = [TF_IDF[i] for i in order]
    # Slice assignment keeps the caller's list objects (in-place contract).
    terms[:] = sorted_terms
    TF_IDF[:len(order)] = sorted_weights
def save_words(path):
    """Register the file at ``path`` as one document in the corpus.

    Every line of the file (trailing newline included) is kept as one entry.
    The resulting list is appended to the module-level ``Docs`` and the
    module-level document counter ``TEXT`` is incremented by one.

    Args:
        path: path of the file to read.
    """
    global Docs, TEXT
    # The context manager closes the file even if reading fails.
    with open(path, "r") as fp:
        terms = fp.readlines()
    Docs.append(terms)
    TEXT += 1
def GenerateIDF(path):
    """Compute an IDF value for every distinct line of the file at ``path``.

    The distinct lines of the file (first-occurrence order, trailing newline
    included) are the terms. For each term, ``df`` is the number of documents
    in the module-level ``Docs`` that contain it, and the IDF value is
    ``math.log(float(TEXT) / df + 0.01)``, where ``TEXT`` is the module-level
    document counter.

    Args:
        path: path of the file whose lines are the terms.

    Returns:
        A tuple ``(IDF, terms)`` of two parallel lists.

    Raises:
        ZeroDivisionError: if a term occurs in none of the documents in
            ``Docs`` (for example when the file was never registered).
    """
    # Distinct lines in first-seen order; the set makes each check O(1).
    terms = []
    seen = set()
    with open(path, "r") as fp:
        for line in fp:
            if line not in seen:
                seen.add(line)
                terms.append(line)

    # Document frequency: one pass over each document, counting a term at
    # most once per document.
    doc_freq = dict.fromkeys(terms, 0)
    for doc in Docs:
        for term in seen.intersection(doc):
            doc_freq[term] += 1

    total = float(TEXT)
    IDF = [math.log(total / doc_freq[term] + 0.01) for term in terms]
    return IDF, terms
def GenerateTF(path, terms):
    """Compute a term-frequency value for each entry of ``terms``.

    For each term, the value is the number of lines in the file at ``path``
    that equal the term exactly (trailing newline included), divided by
    ``len(terms)``.

    NOTE(review): the denominator is the number of entries in ``terms``, not
    the number of lines in the file -- confirm this normalisation is intended.

    Args:
        path: path of the file to count lines in.
        terms: list of terms (hashable, normally raw lines) to score.

    Returns:
        A list of floats parallel to ``terms``; empty if ``terms`` is empty.
    """
    # Count every line once instead of rescanning the file per term.
    counts = {}
    with open(path, "r") as fp:
        for line in fp:
            counts[line] = counts.get(line, 0) + 1

    terms_count = float(len(terms))
    return [counts.get(term, 0) / terms_count for term in terms]
def save_weight(TF, IDF, terms, path, top=200):
    """Write the highest-weighted terms and their TF-IDF weights to ``path``.

    The weight of term ``i`` is ``float(TF[i]) * float(IDF[i])``. The terms
    are ordered by descending weight with the module's ``sort`` helper, and
    at most ``top`` lines of the form ``"<term> <weight>\\n"`` are written.
    The module-level counter ``SUCCEED`` grows by the number of lines written.

    Args:
        TF: list of term-frequency values.
        IDF: list of IDF values parallel to ``TF``.
        terms: list of terms parallel to ``TF``; reordered in place by
            ``sort``.
        path: path of the output file; it is created or truncated.
        top: maximum number of lines to write (defaults to 200).
    """
    global SUCCEED
    TF_IDF = [float(tf) * float(idf) for tf, idf in zip(TF, IDF)]
    # Never ask for more entries than there are weights.
    limit = max(0, min(top, len(TF_IDF)))

    # Order the data before touching the output file, so a failure while
    # sorting does not leave a truncated file behind.
    sort(terms, TF_IDF)
    lines = [
        term.strip() + " " + str(weight) + "\n"
        for term, weight in zip(terms[:limit], TF_IDF[:limit])
    ]

    # The context manager closes the file even if writing fails.
    with open(path, "w+") as fp:
        fp.writelines(lines)
    SUCCEED += len(lines)
def?read_dir(pathw_path):
????global?SUCCEED
????file_list?=?[]
????files?=?os.listdir(path)
????print?“please?wait......“
????for?f?in?files:
????????file_list.append(f)
????????r_name?=?path?+?‘\\‘?+?f
????????save_words(r_name)
????print?“sum?of?docs?is:%d“%TEXT
????for?i?in?file_list:
????????print?i
????????name?=?path?+?‘\\‘?+?i
????????w_name?=?w_path?+?‘\\‘?+?i
????????IDFterms
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       3457  2015-10-22 22:52  TFIDFMeasure.py
     文件       3455  2015-05-25 19:40  DFTF.CPP
----------- ---------  ---------- -----  ----
                 6912                    2
- 上一篇:c語言 程序填空題題庫
- 下一篇:矩陣類的運算符重載
評論
共有 條評論