資源簡介
微博用戶評論情感分析 Python 代碼,完整且可運行(數據規模約 20 萬條)。
代碼片段和文件信息
#?-*-?coding:?utf-8?-*-
import?os
import?pandas?as?pd
import?nltk
from?tools?import?proc_text?split_train_test?get_word_list_from_data?extract_feat_from_data?cal_acc
from?nltk.text?import?TextCollection
from?sklearn.linear_model?import?LogisticRegression
from?sklearn.multiclass?import?OneVsRestClassifier
from?sklearn.svm?import?LinearSVC
from?keras.models?import?Sequential
from?keras.layers?import?*
from?keras.optimizers?import?SGDAdam
import?keras
# Directory holding the raw corpus files and the CSV files generated from them.
dataset_path = './dataset'
# Raw Weibo text files; the leading digit of each file name is parsed as the
# class label (0, 1, 2 or 3) by read_and_save_to_csv().
text_filenames = [
    '0_simplifyweibo.txt',
    '1_simplifyweibo.txt',
    '2_simplifyweibo.txt',
    '3_simplifyweibo.txt',
]
# CSV file that stores the raw (label, text) data.
output_text_filename = 'raw_weibo_text.csv'
# CSV file that stores the cleaned text data.
output_cln_text_filename = 'clean_weibo_text.csv'
# Processing and cleaning the text takes a long time, so it is controlled by
# is_first_run:
#   True  -- first run; the raw text still has to be processed and cleaned.
#   False -- the cleaned text has already been produced and saved to disk.
is_first_run = False
# NOTE(review): not referenced in the visible part of this file; presumably
# toggles loading of cached NumPy arrays -- confirm against the full script.
load_np = False
def read_and_save_to_csv(data_dir=None, filenames=None, output_filename=None):
    """Read the raw text files and save labels plus text into a single CSV.

    Every line of every input file becomes one row. The label of a row is
    the integer value of the first character of the file name it came from.
    The resulting CSV has the two columns ``label`` and ``text`` and no
    index column.

    Args:
        data_dir: Directory containing the input files and receiving the
            output CSV. Defaults to the module-level ``dataset_path``.
        filenames: Iterable of input file names inside ``data_dir``.
            Defaults to the module-level ``text_filenames``.
        output_filename: Name of the CSV file to write inside ``data_dir``.
            Defaults to the module-level ``output_text_filename``.

    Raises:
        ValueError: If a file name does not start with a digit, or if
            ``filenames`` is empty (``pd.concat`` has nothing to join).
        OSError: If an input file cannot be opened or the CSV cannot be
            written.
    """
    # Fall back to the module configuration so that existing zero-argument
    # callers keep working unchanged.
    if data_dir is None:
        data_dir = dataset_path
    if filenames is None:
        filenames = text_filenames
    if output_filename is None:
        output_filename = output_text_filename

    # One DataFrame per input file, concatenated at the end.
    text_w_label_df_lst = []
    for text_filename in filenames:
        text_file = os.path.join(data_dir, text_filename)
        # The label is encoded as the first character of the file name.
        label = int(text_filename[0])
        with open(text_file, 'r', encoding='utf-8') as f:
            # splitlines() handles \n, \r and \r\n and drops the terminators.
            lines = f.read().splitlines()
        # Repeat the label once per line so both columns have equal length.
        text_w_label_df_lst.append(
            pd.DataFrame({'label': [label] * len(lines), 'text': lines})
        )

    result_df = pd.concat(text_w_label_df_lst, axis=0, ignore_index=True)
    # Fix the column order explicitly: label first, text second.
    result_df = result_df[['label', 'text']]
    result_df.to_csv(
        os.path.join(data_dir, output_filename),
        index=False,
        encoding='utf-8',
    )
def?run_main():
????“““
????????主函數(shù)
????“““
????#?1.?數(shù)據(jù)讀取,處理,清洗,準(zhǔn)備
????if?is_first_run:
????????print(‘處理清洗文本數(shù)據(jù)中...‘?end=‘?‘)
????????#?如果是第一次運(yùn)行需要對原始文本數(shù)據(jù)進(jìn)行處理和清洗
????????#?讀取原始文本數(shù)據(jù),將標(biāo)簽和文本數(shù)據(jù)保存成csv
????????read_and_save_to_csv()
????????#?讀取處理好的csv文件,構(gòu)造數(shù)據(jù)集
????????text_df?=?pd.read_csv(os.path.join(dataset_path?output_text_filename)
??????????????????????????????encoding=‘utf-8‘)
????????#?處理文本數(shù)據(jù)
????????text_df[‘text‘]?=?text_df[‘text‘].apply(proc_text)
????????#?過濾空字符串,去掉所有空行部分
????????text_df?=?text_df[text_df[‘text‘]?!=?‘‘]
????????#?保存處理好的文本數(shù)據(jù),文本預(yù)處理結(jié)束
????????text_df.to_csv(os.path.join(dataset_path?output_cln_text_filename)
???????????????????????index=None?encoding=‘utf-8‘)
????????print(‘完成,并保存結(jié)果。‘)
????#?2.?分割訓(xùn)練集、測試集
????print(‘加載處理好的文本數(shù)據(jù)‘)
????clean_text_df?=?pd.read
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-11-04 11:31  weibo\
     目錄           0  2018-11-03 15:18  weibo\.idea\
     文件         159  2018-11-03 15:18  weibo\.idea\encodings.xml
     文件         695  2018-11-03 15:18  weibo\.idea\misc.xml
     文件         262  2018-11-03 15:18  weibo\.idea\modules.xml
     文件         284  2018-11-03 15:18  weibo\.idea\weibo.iml
     文件        1610  2018-11-03 15:18  weibo\.idea\workspace.xml
     目錄           0  2018-11-03 20:53  weibo\__pycache__\
     文件        3536  2018-11-03 20:53  weibo\__pycache__\tools.cpython-36.pyc
     文件        9155  2018-11-03 22:43  weibo\cnn.py
     目錄           0  2018-11-03 14:38  weibo\dataset\
     文件    58786051  2017-10-19 19:00  weibo\dataset\0_simplifyweibo.txt
     文件    15152829  2017-10-19 19:00  weibo\dataset\1_simplifyweibo.txt
     文件    16971885  2017-10-19 19:00  weibo\dataset\2_simplifyweibo.txt
     文件     7649621  2017-10-19 19:00  weibo\dataset\3_simplifyweibo.txt
     文件    36371678  2018-11-03 21:14  weibo\dataset\clean_weibo_text.csv
     文件    99137761  2018-11-03 20:53  weibo\dataset\raw_weibo_text.csv
     文件          75  2017-10-19 19:00  weibo\dataset\readme.txt
     文件        6202  2018-11-03 18:30  weibo\main.py
     文件         971  2018-11-03 21:01  weibo\read.py
     文件        6353  2018-11-03 20:53  weibo\tools.py
     文件        5463  2017-10-19 19:00  weibo\中文停用詞庫.txt
     文件        6038  2017-10-19 19:00  weibo\哈工大停用詞表.txt
     文件        8571  2017-10-19 19:00  weibo\四川大學機器智能實驗室停用詞庫.txt
評論
共有 條評論