資源簡介
TFIDF是經典的算法,可以進行文本相似度計算和文檔聚類,值得研究

代碼片段和文件信息
/************************************************
* 《WEB數據挖掘與知識發現》試驗報告實現程序      *
* 功能:采用TFIDF自動對文本進行形式化(題目6)  *
*                                               *
* 時間:2008.2.28                               *
************************************************/
/* Header names were lost in extraction. The first three are required by the
 * fopen/fgetc/printf, malloc and strcmp calls below; <math.h> is presumed
 * (TF-IDF weighting normally needs log()) -- TODO confirm against the
 * original file. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define FNUM 20   /* total number of files used */
/*
 * Node of the binary search tree that stores the index terms (words)
 * of a document together with their counters.
 */
struct Ttree
{
    char data[20];          /* the word, NUL-terminated (at most 19 characters) */
    double weight;          /* presumably the term's TF-IDF weight -- TODO confirm */
    double num;             /* number of occurrences of this index term in one document */
    double max;             /* total number of words in the document */
    double n;               /* number of documents the index term appears in */
    struct Ttree *lchild;   /* left child */
    struct Ttree *rchild;   /* right child */
};
/* File-scope tree root; not referenced in the visible part of the file. */
struct Ttree *rootW = NULL;

/* Intermediate variables for the weight-sorting routine. */
struct Ttree *mtree = NULL, *ntree = NULL, *rtree = NULL;

/*
 * Stream for "mm.txt", opened for writing during static initialization.
 * A function-call initializer at file scope is only valid in C++, so this
 * file must be built as C++. NOTE(review): the result is not checked here;
 * any code using this stream should handle fp == NULL.
 */
FILE *fp = fopen("mm.txt", "w");
//創(chuàng)建二叉樹用來(lái)存放單詞,以及該詞在文檔中出現(xiàn)的次數(shù)
Ttree?*createTtree(Ttree?*rootFILE?*fp){
int?i=0t=0;
struct?Ttree?*p*q;??????????????//定義中間指針變量
char?ch;
p=(Ttree*)malloc(sizeof(Ttree));?//申請(qǐng)新的存儲(chǔ)空間
p->data[0]=‘\0‘;
p->max=0;????????????????????????//**************3月1日增加
if(fp==NULL)
{
printf(“\nCannot?open?file?strike?any?key?exit!“);
return?NULL;
}
ch=fgetc(fp);
while((ch!=EOF)&&(t==0))
{?
if((ch>=‘a(chǎn)‘&&ch<=‘z‘)||(ch>=‘A‘&&ch<=‘Z‘)){
if(ch<=‘Z‘)?ch=ch+32;
p->data[i]=ch;
i++;
}
else
{
if(p->data[0]==‘\0‘){
ch=fgetc(fp);
continue;
}
p->data[i]=‘\0‘;
p->max++;
p->n=1;
p->num=1;
i=0;
t=1;
p->lchild=NULL;
p->rchild=NULL;???????????//初始化頭節(jié)點(diǎn)的左右兒子為空指針
root=p;
}
ch=fgetc(fp);
}
????q=(Ttree*)malloc(sizeof(Ttree));
????q->data[0]=‘\0‘?;
while(ch!=EOF){
if(?(ch>=‘a(chǎn)‘&&ch<=‘z‘)?||?(ch>=‘A‘&&ch<=‘Z‘)?)?{
if(ch<=‘Z‘)?ch=ch+32;
q->data[i]=ch;
i++;
ch=fgetc(fp);
}
????????else{
if(q->data[0]==‘\0‘)
{
ch=fgetc(fp);
continue;
}
q->data[i]=‘\0‘;
root->max++;
q->n=1;
q->num=1;
i=0;
q->lchild=NULL;
q->rchild=NULL;????????????????????//初始化頭節(jié)點(diǎn)的左右兒子為空指針
if(p==NULL)p=root;
ch=fgetc(fp);
while(p!=NULL)?????????????????????//尋找待插入節(jié)點(diǎn)的位置
{
if(strcmp(q->datap->data)<0){?//如果待插入的節(jié)點(diǎn)的值小于當(dāng)前節(jié)點(diǎn)的值,
if(p->lchild==NULL)????????//且其左子樹為空
{
p->lchild=q;???????????//??則插入
p=NULL;
}??????????????????????????//并置當(dāng)前節(jié)點(diǎn)為空,退出當(dāng)前的while循環(huán)
else
p=p->lchild;
}?//?否則繼續(xù)訪問(wèn)其左子樹
else?if(strcmp(q->datap->data)>0){?//如果待插入的節(jié)點(diǎn)的值大于當(dāng)前節(jié)點(diǎn)的值
if(p->rchild==NULL)?????????????//?且其右子樹為空
{
p->rchild=q;????????????????//??則插入
p=NULL;
}?//并置當(dāng)前節(jié)點(diǎn)為空,退出當(dāng)前的while循環(huán)
else
p=p->rchild;
}?//?否則繼續(xù)訪問(wèn)其右子樹
else{
p->num++;
p=NULL;
}
}//while
????????????q=(Ttree*)malloc(sizeof(Ttree));
????????????q->data[0]=‘\0‘;
}//else
}//while
return?root;
}
/*
二叉樹查找
計算某個詞在幾篇文檔中出現
*/
Ttree?*SearchBinTtree(Ttree?*rootxTtree?*rooty){
if(rootx==NULL)?return?NULL;
if(strcmp(rootx->datarooty->data)==0){
rooty->n++;
return?rootx;
}
if(strcmp(rootx->datarooty->data)>0)?return?S
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        712  2015-06-01 19:48  tfidfsrc\Debug\cl.command.1.tlog
     文件       1876  2015-06-01 19:48  tfidfsrc\Debug\CL.read.1.tlog
     文件        340  2015-06-01 19:48  tfidfsrc\Debug\CL.write.1.tlog
     文件     500224  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe
     文件        406  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe.em
     文件        472  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe.em
     文件        381  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe.intermediate.manifest
     文件    1074516  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.ilk
     文件         64  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.lastbuildstate
     文件       5090  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.log
     文件      20703  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.obj
     文件    1993728  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.pdb
     文件        707  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.vcxprojResolveAssemblyReference.cache
     文件          0  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.write.1.tlog
     文件        236  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval_manifest.rc
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件       1586  2015-06-01 19:48  tfidfsrc\Debug\li
     文件       3214  2015-06-01 19:48  tfidfsrc\Debug\li
     文件        840  2015-06-01 19:48  tfidfsrc\Debug\li
     文件        450  2015-06-01 19:48  tfidfsrc\Debug\mt.command.1.tlog
     文件        330  2015-06-01 19:48  tfidfsrc\Debug\mt.read.1.tlog
     文件        330  2015-06-01 19:48  tfidfsrc\Debug\mt.write.1.tlog
     文件        630  2015-06-01 19:48  tfidfsrc\Debug\rc.command.1.tlog
     文件        302  2015-06-01 19:48  tfidfsrc\Debug\rc.read.1.tlog
     文件        310  2015-06-01 19:48  tfidfsrc\Debug\rc.write.1.tlog
............此處省略43個文件信息
評論
共有 條評論