資源簡介
Python網絡爬蟲獲取去哪兒網景點信息源碼,獲取的景點信息有'景點', '景點類別', '景點級別', '地點', '經度', '緯度', '開放時間', '景點介紹', '評論次數', '游客評分', '熱度', '關鍵詞', '圖片路徑'。內有詳細注釋。
代碼片段和文件信息
# -*- coding: utf-8 -*-
import codecs
import csv
import re
import string
import time
import urllib
import urllib.request
from optparse import OptionParser
from urllib.parse import *

import jieba
import jieba.analyse
from bs4 import BeautifulSoup
# Compiled pattern matching a run of one or more decimal digits.
# It is not referenced in the visible part of this script.
hotnum = re.compile(r'\d(\d)*')
def getHotNum(cNum, grade):
    """Return a popularity score built from a review count and a rating.

    The review count contributes ``int(cNum) / 1000`` points, capped at 50
    once the count reaches 50000.  The rating contributes
    ``10 * float(grade)`` points.

    Args:
        cNum: Review count; any value accepted by ``int()``.
        grade: Rating; any value accepted by ``float()``.

    Returns:
        The combined score as a float.

    Raises:
        ValueError: If either argument cannot be converted to a number.
    """
    count = int(cNum)
    rating_points = 10 * float(grade)
    if count >= 50000:
        # Large counts are capped so they cannot dominate the score.
        return 50 + rating_points
    return count / 1000 + rating_points
# HTTP request settings.
url_base = 'http://piao.qunar.com'  # alternative: 'http://piao.qunar.com/ticket/list.htm?'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0)'
header = {'User-Agent': user_agent}
# NOTE(review): looks like a service credential committed to source; it is not
# used in the visible code -- confirm what it is for and consider loading it
# from configuration instead.
ak = 'whSDgmRhKopIDFMCGxj21FcY611b6R9h'

# Results are appended to a CSV file.  newline='' lets the csv module control
# line endings itself.  The handle stays open for the rest of the script, so
# it is deliberately not wrapped in a ``with`` block here.
csvfile = open('畢設測試數據.csv', 'a+', encoding='utf-8', newline='')
writer = csv.writer(csvfile)
# Header row (currently disabled).  Column order: attraction, category, level,
# location, longitude, latitude, opening hours, description, review count,
# visitor rating, popularity, keywords, image path.
# Fetch the listing pages and visit every attraction linked from them.

# Listing URL for one result page; ``{}`` is the page number.  The keyword is
# already percent-encoded, while the raw CJK ``subject`` value is encoded by
# quote() inside the loop.
# NOTE(review): the keyword decodes to Simplified Chinese but ``subject`` is
# written with Traditional characters here -- confirm which form the site
# expects.
LIST_URL_TEMPLATE = (
    'http://piao.qunar.com/ticket/list.htm'
    '?keyword=%E7%83%AD%E9%97%A8%E6%99%AF%E7%82%B9'
    '&region=&from=mpl_search_suggest&subject=文化古跡&page={}'
)

# NOTE(review): with a start page of 38 the ``> 2`` guard below ends the loop
# on its first pass, so nothing is fetched, and nothing in this part of the
# script advances pageIndex.  Confirm the intended start page.
pageIndex = 38  # index of the listing page to request
while True:
    if pageIndex > 2:  # cap on the number of pages crawled
        break
    if pageIndex != 1:  # the first (default) page is fetched silently
        print(pageIndex)
    # string.printable marks every ASCII character (including '%') as safe,
    # so existing %XX escapes are kept and only non-ASCII text is encoded.
    url = quote(LIST_URL_TEMPLATE.format(pageIndex), safe=string.printable)

    # Download the listing page; the context manager closes the connection.
    request = urllib.request.Request(url, headers=header)
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8', 'ignore')
    soup = BeautifulSoup(html, 'html.parser')

    # Collect the detail-page anchors inside every result-list container.
    links = []
    for result_block in soup.find_all('div', 'result_list'):
        links.extend(result_block.find_all('a', 'sight_item_do'))

    # Flat list of scraped fields (name, location, opening hours,
    # description, popularity, ...).
    results = []
    i = 0  # which attraction is being processed
    for item in links:
        # Build the absolute URL of the attraction's detail page.
        href = url_base + item['href']

        # Download the detail page, decoding leniently like the listing page.
        request = urllib.request.Request(href, headers=header)
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8', 'ignore')
        soup_content = BeautifulSoup(html, 'html.parser')  # attraction page

        name_tag = soup_content.find('span', 'mp-description-name')
        if name_tag is None:
            # Without a name span the page is not a usable attraction page;
            # skip it before any field is appended.
            print('skipped (no attraction name found):', href)
            continue
        name = name_tag.string
        results.append(name)  # attraction name
        results.append('文化古跡')  # attraction category
評論
共有 條評論