資源簡介
Python項目案例開發從入門到實戰源代碼第5章 爬蟲應用——校園網搜索引擎

代碼片段和文件信息
import?sys
from?collections?import?deque
import?urllib
from?urllib?import?request
import?re
from?bs4?import?BeautifulSoup
import?lxml
import?sqlite3
import?jieba
# Crawl and index setup for the campus-site search engine builder.
#
# A safety prompt used to live here and is intentionally left disabled: it
# asked the user to confirm rebuilding the word index for roughly 5000
# documents and called sys.exit() unless the answer was 'y'.

# Entry page of the crawl (the site root 'http://www.zut.edu.cn' was the
# alternative entry point considered).
url = 'http://www.zut.edu.cn/index/xwdt.htm'
unvisited = deque()  # links waiting to be crawled; FIFO use gives breadth-first order
visited = set()      # links that have already been taken off the queue
unvisited.append(url)

# Recreate the two index tables from scratch on every run.  "if exists" keeps
# the very first run (when no table has been created yet) from failing with
# sqlite3.OperationalError, while still discarding tables left by earlier runs.
conn = sqlite3.connect('viewsdu.db')
try:
    c = conn.cursor()
    c.execute('drop table if exists doc')
    # doc: one row per document, columns (id, link).
    c.execute('create table doc (id int primary key,link text)')
    c.execute('drop table if exists word')
    # word: one row per term, columns (term, list).
    # NOTE(review): the format stored in "list" is not visible here - confirm
    # against the indexing code further down the script.
    c.execute('create table word (term varchar(25) primary key,list text)')
    conn.commit()
finally:
    # Always release the database file, even if schema creation fails.
    conn.close()
print('***************開始!***************************************************')
cnt = 0  # number of links dequeued so far
print('開始。。。。。 ' )
while?unvisited:
????url=unvisited.popleft()
????visited.add(url)
????cnt+=1
????print(‘開始抓取第‘cnt‘個(gè)鏈接:‘url)
????#爬取網(wǎng)頁內(nèi)容
????try:
????????response=request.urlopen(url)
????????content=response.read().decode(‘utf-8‘)
????????
????except:
????????continue
????#尋找下一個(gè)可爬的鏈接,因?yàn)樗阉鞣秶蔷W(wǎng)站內(nèi),所以對鏈接有格式要求,這個(gè)格式要求根據(jù)具體情況而定
????#解析網(wǎng)頁內(nèi)容可能有幾種情況這個(gè)也是根據(jù)這個(gè)網(wǎng)站網(wǎng)頁的具體情況寫的
????soup=BeautifulSoup(content‘lxml‘)
????all_a=soup.find_all(‘a(chǎn)‘{‘class‘:“c67214“})???#本頁面所有的新聞鏈接
????for?a?in?all_a:
????????#print(a.attrs[‘href‘])
????????x=a.attrs[‘href‘]???????????#網(wǎng)址
????????if?re.match(r‘http.+‘x):???#排除是http開頭,而不是http://www.zut.edu.cn網(wǎng)址
????????????if?not?re.match(r‘http\:\/\/www\.zut\.edu\.cn\/.+‘x):
????????????????continue
????????if?re.match(r‘\/info\/.+‘x):???????#“/info/1046/20314.htm“
????????????x=‘http://www.zut.edu.cn‘+x
????????elif?re.match(r‘info/.+‘x)?:???????#“info/1046/20314.htm“
????????????x=‘http://www.zut.edu.cn/‘+x?
????????elif?re.match(r‘\.\.\/info/.+‘x):??#“../info/1046/20314.htm“?
????????????x=‘http://www.zut.edu.cn‘+x[2:]
????????elif?re.match(r‘\.\.\/\.\.\/info/.+‘x):??#“../../info/1046/20314.htm“?
????????????x=‘http://www.zut.edu.cn‘+x[5:]
????????#print(x)
????????if?(x?not?in?visited)?and?(x?not?in?unvisited):????????????
????????????????unvisited.append(x)
????????????????
????a=soup.find(‘a(chǎn)‘{‘class‘:“Next“})???#下一頁
????if?a!=None:
????????x=a.attrs[‘href‘]???????????#網(wǎng)址
????????if?re.match(r‘xwdt\/.+‘x):
????????????x=‘http://www.zut.edu.cn/index/‘+x
????????else:
????????????x=‘http://www.zut.edu.cn/index/xwdt/‘+x
????????if?(x?not?in?visited)?and?(x?not?in?unvisited):????????????
???????????unvisited.append(x)????
????
????title=soup.title
????article=soup.find(‘div‘class_=‘c67215_content‘id=‘vsb_newscontent‘)
????author=soup.find(‘span‘class_=“authorstyle67215“)??#作者
????time=soup.find(‘span‘class_=“timestyle67215“)
????if?title==None?and?article==None?and?author==None:
????????print(‘無內(nèi)容的頁面?!?br/>????????continue
????elif?article==Non
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       5604  2018-04-05 16:37  第5章  爬蟲應用——校園網搜索引擎\search_engine_build-2.py
     文件       2146  2018-08-05 11:13  第5章  爬蟲應用——校園網搜索引擎\search_engine_use.py
     文件    3912704  2018-04-05 16:58  第5章  爬蟲應用——校園網搜索引擎\viewsdu - 副本.db
     文件    4865024  2018-08-05 11:05  第5章  爬蟲應用——校園網搜索引擎\viewsdu.db
     目錄          0  2018-11-07 19:54  第5章  爬蟲應用——校園網搜索引擎
----------- ---------  ---------- -----  ----
              8785478                    5
評論
共有 條評(píng)論