資源簡介
python爬蟲爬取微博熱搜
代碼片段和文件信息
# -*- coding: utf-8 -*-
# @Time : 2020/12/16 14:37
# @Author : wy
# @File : spider.py
# @Software : PyCharm
"""Scrape the Weibo real-time hot-search list and append it to a dated file.

Approach:
1. Page analysis: find the page URL and where the data sits in the markup.
2. Fetching: download the HTML source with a ``requests`` GET request.
3. Parsing: extract the required fields with ``lxml`` XPath expressions.
4. Storage: write the extracted rows to a text file via ``with open``.
"""
import time                     # standard library: date formatting

# Third-party libraries, must be installed separately.
import requests                 # HTTP client used to fetch the page
from lxml import etree          # HTML parser with XPath support

# Format today's date, e.g. "2020年12月28日". The CJK characters are filled
# in by str.format() afterwards, which keeps the strftime() format string
# ASCII-only.
today = time.strftime(
    '%Y{y}%m{m}%d{d}', time.localtime()).format(y='年', m='月', d='日')
print(today)

# Fetch the page.
url = "https://s.weibo.com/top/summary?cate=realtimehot"     # hot-search URL
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60"
}        # browser-like request headers
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

# Parse the page: convert the HTML text into an element tree.
html = etree.HTML(response.text)
# Select the parent rows first, then pull each field out of every row.
datas = html.xpath('//*[@id="pl_top_realtimehot"]/table/tbody/tr')

# Store the rows. The file is named after today's date and is opened once,
# in append mode, rather than being reopened for every row.
with open(f"./{today}.txt", 'a', encoding='utf-8') as f:
    for data in datas:
        data_title = ''.join(data.xpath('td[2]/a/text()'))     # hot-search title
        data_rank = ''.join(data.xpath('td[1]/text()'))        # hot-search rank
        data_num = ''.join(data.xpath('td[2]/span/text()'))    # text of the <span> next to the title
        print(data_rank, data_title, data_num)
        f.write("%s\t%s%s\n" % (data_rank, data_title, data_num))
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        184  2020-12-16 14:39  weibo\.idea\.gitignore
     文件        174  2020-12-16 14:39  weibo\.idea\inspectionProfiles\profiles_settings.xml
     文件        410  2020-12-16 14:39  weibo\.idea\inspectionProfiles\Project_Default.xml
     文件        302  2020-12-16 14:39  weibo\.idea\misc.xml
     文件        269  2020-12-16 14:39  weibo\.idea\modules.xml
     文件        361  2020-12-16 14:39  weibo\.idea\weibo.iml
     文件       6060  2020-12-28 23:33  weibo\.idea\workspace.xml
     文件       1870  2020-12-28 23:33  weibo\20201228'.txt'
     文件       1819  2020-12-28 23:33  weibo\spider.py
     文件       2176  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\AUTHORS
     文件       1315  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\COPYING.txt
     文件          4  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\INSTALLER
     文件       1447  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\LICENSE
     文件       4190  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\METADATA
     文件       3121  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\RECORD
     文件          0  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\REQUESTED
     文件          4  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\top_level.txt
     文件         92  2020-12-16 14:58  weibo\venv\Lib\site-packages\beautifulsoup4-4.9.3.dist-info\WHEEL
     文件      18748  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\_html5lib.py
     文件      18405  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\_htmlparser.py
     文件      12234  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\_lxml.py
     文件      19777  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\__init__.py
     文件      12476  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\__pycache__\_html5lib.cpython-39.pyc
     文件      12968  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\__pycache__\_htmlparser.cpython-39.pyc
     文件       9418  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\__pycache__\_lxml.cpython-39.pyc
     文件      15293  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\builder\__pycache__\__init__.cpython-39.pyc
     文件      34130  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\dammit.py
     文件       7755  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\diagnose.py
     文件      81650  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\element.py
     文件       5654  2020-12-16 14:58  weibo\venv\Lib\site-packages\bs4\formatter.py
............此處省略1722個文件信息
評論
共有 條評論