資源簡介
基於樸素貝葉斯的垃圾郵件分類,對垃圾郵件的分類有較好的效果,達到99%。

代碼片段和文件信息
# -*- coding: utf-8 -*-
import re

import numpy as np
# Splits on every run of characters that are not ASCII letters. Digits are
# already non-letters, so no separate digit alternative is needed.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]+')


def textParser(text):
    """
    Tokenize one SMS message into lowercase alphabetic words.

    The text is split on every run of non-letter characters (digits,
    punctuation, whitespace, non-ASCII characters), empty fragments are
    dropped and the remaining words are lowercased.

    :param text: raw message text
    :return: list of lowercase words, in order of appearance
    """
    # A leading or trailing separator still yields an empty fragment, so
    # filter those out while lowercasing.
    return [word.lower() for word in _NON_LETTER_RE.split(text) if word]
def loadSMSData(fileName):
    """
    Load a labelled SMS corpus and tokenize every message.

    Each line is expected to hold a label (``ham`` or ``spam``), a TAB, and
    the message text. Lines without a TAB-separated message or with any
    other label are skipped, so the two returned lists always stay aligned.

    :param fileName: path of the corpus file
    :return: tuple ``(smsWords, classCategory)``; ``smsWords`` is a list of
             word lists (one per message) and ``classCategory`` the matching
             labels, 1 for spam and 0 for a normal message
    """
    labelToCategory = {'ham': 0, 'spam': 1}
    smsWords = []
    classCategory = []  # class labels: 1 = spam SMS, 0 = normal SMS
    # The context manager closes the file even if parsing raises.
    with open(fileName) as f:
        for line in f:
            linedatas = line.strip().split('\t')
            # Skip blank/malformed lines and unknown labels instead of
            # crashing or appending words without a matching category.
            if len(linedatas) < 2 or linedatas[0] not in labelToCategory:
                continue
            classCategory.append(labelToCategory[linedatas[0]])
            # Tokenize the message text.
            smsWords.append(textParser(linedatas[1]))
    return smsWords, classCategory
def createVocabularyList(smsWords):
    """
    Build the vocabulary (the distinct words) of a tokenized corpus.

    :param smsWords: iterable of word lists, one per message
    :return: list of the unique words, sorted so that the word-to-index
             mapping is reproducible from run to run
    """
    vocabularySet = set()
    for words in smsWords:
        # In-place update avoids building a new set for every message.
        vocabularySet.update(words)
    return sorted(vocabularySet)
def getVocabularyList(fileName):
    """
    Read the vocabulary from a vocabulary-list file.

    Only the first line of the file is read; it is expected to hold the
    words separated by TAB characters.

    :param fileName: path of the vocabulary file
    :return: list of words; an empty list when the first line is blank
    """
    # The context manager closes the file even if the read raises.
    with open(fileName) as fr:
        firstLine = fr.readline().strip()
    # ''.split('\t') would yield [''], i.e. a bogus one-word vocabulary.
    if not firstLine:
        return []
    return firstLine.split('\t')
def setOfWordsToVecTor(vocabularyList, smsWords):
    """
    Count how often each vocabulary word occurs in one message.

    :param vocabularyList: list of vocabulary words
    :param smsWords: list of words of a single message
    :return: list of counts with the same length as ``vocabularyList``;
             words that are not in the vocabulary are ignored
    """
    # Build the word -> position lookup once so each message word costs one
    # dict lookup instead of two linear scans of the vocabulary.
    # setdefault keeps the FIRST position of a duplicated vocabulary word,
    # matching list.index().
    wordPosition = {}
    for position, word in enumerate(vocabularyList):
        wordPosition.setdefault(word, position)

    vocabMarked = [0] * len(vocabularyList)
    for smsWord in smsWords:
        position = wordPosition.get(smsWord)
        if position is not None:
            vocabMarked[position] += 1
    return vocabMarked
def setOfWordsListToVecTor(vocabularyList, smsWordsList):
    """
    Convert every message of a tokenized corpus into a word-count vector.

    :param vocabularyList: list of vocabulary words
    :param smsWordsList: list of word lists, one per message
    :return: list of count vectors (one per message, in input order), each
             produced by ``setOfWordsToVecTor``
    """
    return [setOfWordsToVecTor(vocabularyList, smsWords)
            for smsWords in smsWordsList]
def?trainingNaiveBayes(trainMarkedWords?trainCategory):
????“““
????訓練數據集中獲取語料庫中詞匯的spamicity:P(Wi|S)
????:param?trainMarkedWords:?按照語料庫標記的數據,二維數組
????:param?trainCategory:
????:return:
????“““
????numTrainDoc?=?len(trainMarkedWords)
????numWords?=?len(trainMarkedWords[0])
????#?是垃圾郵件的先驗概率P(S)
????pSpam?=?sum(trainCategory)?/?float(numTrainDoc)
????#?統計語料庫中詞匯在S和H中出現的次數
????wordsInSpamNum?=?np.ones(numWords)
????wordsInHealthNum?=?np.ones(numWords)
????spamWordsNum?=?2.0
????healthWordsNum?=?2.0
????for?i?in?range(0?numTrainDoc):
????????if?trainCategory[i]?==?1:??#?如果是垃圾SMS或郵件
????????????wordsInSpamNum?+=?trainMarkedWords[i]
????????????spamWordsNum?+=?sum(trainMarkedWords[i])??#?統計Spam中語料庫中詞匯出現的總次數
????????else:
????????????wordsInHealthNum?+=?trainMarkedWords[i]
????????????healthWo
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件???????5111??2018-05-07?16:36??SMS\NaiveBayes\NaiveBayes.py
?????文件???????4842??2017-04-22?10:48??SMS\NaiveBayes\NaiveBayes.pyc
?????文件?????????14??2017-04-19?16:33??SMS\NaiveBayes\pSpam.txt
?????文件?????210195??2017-04-19?16:33??SMS\NaiveBayes\pWordsHealthy.txt
?????文件?????210195??2017-04-19?16:33??SMS\NaiveBayes\pWordsSpamicity.txt
?????文件?????198723??2017-04-22?13:55??SMS\NaiveBayes\ROC?Curve.png
?????文件???????3876??2017-04-22?13:53??SMS\NaiveBayes\SenSpeciList0.csv
?????文件???????3876??2017-04-22?13:53??SMS\NaiveBayes\SenSpeciList1.csv
?????文件???????3876??2017-04-22?13:54??SMS\NaiveBayes\SenSpeciList2.csv
?????文件???????3876??2017-04-22?13:54??SMS\NaiveBayes\SenSpeciList3.csv
?????文件???????3952??2017-04-22?13:55??SMS\NaiveBayes\SenSpeciList4.csv
?????文件?????477907??2011-03-15?22:36??SMS\NaiveBayes\SMSSpamCollection.txt
?????文件???????3239??2018-05-07?16:36??SMS\NaiveBayes\test.py
?????文件???????1942??2017-04-20?11:15??SMS\NaiveBayes\test.pyc
?????文件????????802??2018-05-07?16:36??SMS\NaiveBayes\TestPlot.py
?????文件???????1141??2018-05-07?16:36??SMS\NaiveBayes\training.py
?????文件??????54677??2017-04-19?16:33??SMS\NaiveBayes\vocabularyList.txt
?????文件?????????58??2017-04-19?16:32??SMS\NaiveBayes\__init__.py
?????目錄??????????0??2017-04-23?08:39??SMS\NaiveBayes
?????目錄??????????0??2017-04-23?08:39??SMS
-----------?---------??----------?-----??----
??????????????1188302????????????????????20
評論
共有 條評論