Python笔记-多线程爬虫实例
生活随笔
收集整理的這篇文章主要介紹了
Python笔记-多线程爬虫实例
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
如下,線程池兩個線程:
線程池關鍵代碼:
源碼如下:
import multiprocessing
import re
import time

import requests

# Browser User-Agent sent with every request.
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'

# NOTE(review): the proxy credentials are hard-coded here exactly as in the
# original script. They should be moved to configuration / environment
# variables and rotated before this code is used anywhere real.
_PROXY_URL = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": "http-dyn.abuyun.com",
    "port": 9020,
    "user": "V21C9SWA4CQ3FSHD",
    "pass": "1DF3191F6103Q34",
}
# Built once: the same proxy is used for both schemes on every request.
_PROXIES = {"http": _PROXY_URL, "https": _PROXY_URL}

_CITY_URL = "https://www.lagou.com/jobs/allCity.html"
# Listing page for one city; also requested to obtain fresh cookies.
_LIST_URL = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="
# Endpoint that is POSTed to once per result page.
_PAGE_URL = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"

_CITY_RE = re.compile(r'zhaopin/">(.*?)</a>')
_TOTAL_PAGE_RE = re.compile(r'class="span\stotalNum">(\d+)</span>')

# Substrings that mark a "too frequent" response. The original script only had
# the traditional form; that looks like an artifact of the page it was copied
# from (its comments were converted the same way), so the simplified form is
# checked as well. NOTE(review): confirm which form the site actually returns.
_THROTTLE_MARKERS = ("频繁", "頻繁")

_TIMEOUT = 6        # seconds, per HTTP request
_RETRY_DELAY = 10   # seconds to wait before retrying a failed/throttled request


class HandleLaGou(object):
    """Fetch the city list and per-city paged results from lagou.com.

    All requests go through one ``requests`` session (so cookies are shared)
    and through the proxy defined in ``_PROXIES``.
    """

    def __init__(self):
        self.laGou_session = requests.session()
        self.header = {'User-Agent': _USER_AGENT}
        # Filled by handle_city() with the city names found on the page.
        self.city_list = []

    def handle_city(self):
        """Download the all-cities page and store the city names in ``city_list``.

        The session cookies are cleared afterwards.
        """
        city_result = self.handle_request(method="GET", url=_CITY_URL)
        self.city_list = _CITY_RE.findall(city_result)
        self.laGou_session.cookies.clear()

    def handle_city_job(self, city):
        """Request every result page for ``city`` and print each response body.

        Returns without doing anything when the listing page contains no
        total-page counter.
        """
        listing_url = _LIST_URL % city
        first_response = self.handle_request(method="GET", url=listing_url)

        match = _TOTAL_PAGE_RE.search(first_response)
        if match is None:
            # No page counter on the listing page: nothing to iterate over.
            return

        # Both values are the same for every page, so set them once.
        # The Referer is stored as bytes, as in the original script.
        self.header['Referer'] = listing_url.encode()
        page_url = _PAGE_URL % city

        for page in range(1, int(match.group(1)) + 1):
            response = self.handle_request(
                method="POST",
                url=page_url,
                data={"pn": page, "kd": "python"},
                info=city,
            )
            print(response)

    def handle_request(self, method, url, data=None, info=None):
        """Send one request through the proxy, retrying until it succeeds.

        Args:
            method: ``"GET"`` or ``"POST"``.
            url: Target URL.
            data: Form data for a POST request.
            info: City name used to build the cookie-refresh URL on retry.

        Returns:
            The response body as text. A GET returns it immediately; a POST
            is printed, decoded as UTF-8 and checked for the throttle marker
            first.

        Raises:
            ValueError: If ``method`` is neither ``"GET"`` nor ``"POST"``.
        """
        if method not in ("GET", "POST"):
            raise ValueError("unsupported method: %r" % (method,))

        while True:
            try:
                if method == "GET":
                    response = self.laGou_session.get(
                        url=url, headers=self.header,
                        proxies=_PROXIES, timeout=_TIMEOUT)
                    return response.text
                response = self.laGou_session.post(
                    url=url, headers=self.header, data=data,
                    proxies=_PROXIES, timeout=_TIMEOUT)
            except requests.RequestException:
                # Only network/HTTP failures are retried; KeyboardInterrupt
                # and programming errors now propagate instead of being
                # swallowed by the endless retry loop.
                self._refresh_cookies(info)
                continue

            print(response.text)
            response.encoding = 'utf-8'
            if any(marker in response.text for marker in _THROTTLE_MARKERS):
                # Clear the cookies first, then obtain new ones and retry.
                self._refresh_cookies(info)
                continue
            return response.text

    def _refresh_cookies(self, info):
        """Clear the cookies, re-request the listing page, then wait.

        The listing request is made only to obtain new cookies. ``info`` may
        be ``None``, in which case the URL carries the literal ``None``, as
        it did in the original script. A failure here is ignored on purpose:
        the caller's loop retries the real request and comes back here if
        needed, which avoids the unbounded recursion of calling
        ``handle_request`` from its own error handler.
        """
        self.laGou_session.cookies.clear()
        try:
            self.laGou_session.get(
                url=_LIST_URL % info, headers=self.header,
                proxies=_PROXIES, timeout=_TIMEOUT)
        except requests.RequestException:
            pass
        time.sleep(_RETRY_DELAY)


def main():
    """Fetch the city list, then process the cities with two workers."""
    laGou = HandleLaGou()
    laGou.handle_city()

    # multiprocessing.Pool starts worker *processes* (two of them), not
    # threads. The results of apply_async are not collected.
    pool = multiprocessing.Pool(2)
    for city in laGou.city_list:
        pool.apply_async(laGou.handle_city_job, args=(city,))
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()
總結
以上是生活随笔為你收集整理的Python笔记-多线程爬虫实例的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Leaflef笔记-使用leaflet-
- 下一篇: QML笔记-JavaScript在QML