python爬蟲爬微信公眾號文章

大小: 10KB

文件類型: .py

金幣: 1

下載: 0 次

發布日期: 2021-06-16
語言: Python
標簽: python 爬蟲

高速下載

資源簡介

通過搜狗搜索中的微信搜索入口來爬取微信公眾號上的文章時間,文章標題,文章地址,文章簡介、圖片

資源截圖

小圖大圖

代碼片段和文件信息

#!/usr/bin/python
# coding: utf-8

'''
總的來說就是通過搜狗搜索中的微信搜索入口來爬取
2017-04-13 by Jimy_fengqi
'''
import os
import re
import time
import urllib
import urllib.request
from urllib import parse

import requests
import xlwt
from pyquery import PyQuery as pq
from selenium import webdriver


def dirsIsExists(path):
    """Make sure the directory *path* exists, creating it when missing.

    A one-line status message that names the path is printed in both cases.

    Args:
        path: Directory to check. When it does not exist it is created,
            including any missing intermediate directories.
    """
    if os.path.exists(path):
        message = 'OK the "%s" file exists.'
    else:
        message = 'Sorry I cannot find the "%s" file.'
        # exist_ok avoids a crash if the directory appears between the
        # existence check above and this call.
        os.makedirs(path, exist_ok=True)
    # Fill in the placeholder; wrapping in a tuple keeps this safe even if
    # *path* itself happens to be a tuple.
    print(message % (path,))

    # 爬蟲主函數
def saveImgTwo(path, nameList):
    """Download the picture of every entry in *nameList* into *path*.

    Entries whose "pic" value does not contain "http" are skipped. Saved
    files are named ``0.jpg``, ``1.jpg``, ... in download order, so the file
    number counts successful downloads rather than list positions.

    Args:
        path: Target directory; it is created first when it does not exist.
        nameList: Sequence of mappings, each providing the image URL under
            the "pic" key.
    """
    # Check whether the target directory exists; create it if it does not.
    dirsIsExists(path)
    x = 0  # number of images saved so far, also used as the file name
    for i, item in enumerate(nameList):
        imgurl = item["pic"]
        print("圖片%d:%s\n" % (i + 1, imgurl))
        if 'http' in imgurl:
            print("第 %s" % x + "張圖片")
            urllib.request.urlretrieve(imgurl, os.path.join(path, '%s.jpg' % x))
            x += 1

class weixin_spider:
    """Crawl a WeChat official account's articles via Sogou's WeChat search.

    NOTE(review): the methods below call ``self.log``, which is not defined
    in the visible part of this class -- confirm it exists in the full file.
    """

    def __init__(self, keywords):
        """Store the search keywords and set up the HTTP session and workbook.

        Args:
            keywords: Search term used to look up the official account; it is
                URL-quoted into the Sogou query string.
        """
        self.keywords = keywords
        # Sogou WeChat search entry URL
        self.sogou_search_url = 'http://weixin.sogou.com/weixin?type=1&query=%s&ie=utf8&s_from=input&_sug_=n&_sug_type_=' % parse.quote(self.keywords)

        # Request headers that make the crawler look like a regular browser
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}

        # Timeout, in seconds, applied to HTTP requests
        self.timeout = 5

        # All HTTP requests of the crawler go through one requests.Session
        self.s = requests.Session()

        # First (header) row of the Excel sheet: one label per column
        self.excel_data = [u'編號', u'時間', u'文章標題', u'文章地址', u'文章簡介']
        # Workbook handle used for the Excel output
        self.excle_w = xlwt.Workbook()

    def get_search_result_by_keywords(self):
        """Fetch the Sogou search result page for the configured keywords.

        Returns:
            The raw response body (``.content`` of the response).
        """
        self.log(u'搜索地址為：%s' % self.sogou_search_url)
        response = self.s.get(self.sogou_search_url, headers=self.headers, timeout=self.timeout)
        return response.content

    def get_wx_url_by_sougou_search_html(self, sougou_search_html):
        """Extract the official account's home page URL from the search page.

        Args:
            sougou_search_html: HTML of the Sogou search result page.

        Returns:
            The ``href`` attribute of the link inside the title paragraph of
            the result's text box, as reported by pyquery.
        """
        doc = pq(sougou_search_html)
        # pyquery offers jQuery-like selection (similar in purpose to
        # BeautifulSoup); drill down to the link of the account title.
        return doc('div[class=txt-box]')('p[class=tit]')('a').attr('href')

    def get_selenium_js_html(self, url):
        """Load *url* in PhantomJS and return the page HTML after JS rendering.

        Args:
            url: Address of the page to load.

        Returns:
            The ``outerHTML`` of the document element once the page has had
            three seconds to render.
        """
        browser = webdriver.PhantomJS(executable_path=r'D:\mytoolssoft\idea_Tool\phantomjs-2.1.1-windows\bin\phantomjs.exe')
        try:
            browser.get(url)
            # Give the page's JavaScript some time to finish rendering.
            time.sleep(3)
            # Run JS to obtain the whole rendered page.
            return browser.execute_script("return document.documentElement.outerHTML")
        finally:
            # quit() also ends the driver process, even when loading failed.
            browser.quit()

    def parse_wx_articles_by_html(self, selenium_html):
        """Select the article entries from the rendered account page.

        Args:
            selenium_html: HTML of the account home page after JS rendering.

        Returns:
            The pyquery selection of all ``div`` elements whose class is
            exactly ``"weui_media_box appmsg"``.
        """
        doc = pq(selenium_html)
        print(u'開始查找內容msg')
        return doc('div[class="weui_media_box appmsg"]')

????#有的公眾號僅僅有

上一篇：pycuda-2017.1.1+cuda9185-cp36-cp36m-win_amd64.whl
下一篇：Python_驗證采樣定理.py

xxxx18一60岁hd中国/日韩女同互慰一区二区/西西人体扒开双腿无遮挡/日韩欧美黄色一级片 - 色护士精品影院www

python爬蟲爬微信公眾號文章

資源簡介

資源截圖

代碼片段和文件信息

評論

相關資源