爬虫多线程生产者与消费者
生活随笔
收集整理的這篇文章主要介紹了
爬虫多线程生产者与消费者
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
#-*-coding:utf-8-*-
# -*-coding:utf-8-*-
import json
import threading
import time
from queue import Empty, Queue

import requests
class Pcoduct(threading.Thread):
    """Producer thread: download result pages and queue the decoded JSON.

    Each instance repeatedly takes a page index from the task queue,
    requests that page from the Tencent careers query API and puts the
    decoded JSON response on the output queue for the consumer threads.

    Args:
        i: Label printed in the progress messages to identify this thread.
        q: Task queue holding the page indexes that still have to be fetched.
        out_q: Queue that receives the decoded responses. When omitted, the
            module-level consumer queue ``c`` is looked up at call time, so
            the existing two-argument ``Pcoduct(i, q)`` call keeps working.
    """

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot block the thread (and the final join) forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, i, q, out_q=None):
        super().__init__()
        self.i = i
        self.q = q
        self.out_q = out_q

    def run(self):
        """Fetch pages until the task queue is drained."""
        while True:
            # EAFP: another producer may take the last task between an
            # ``empty()`` check and ``get()``, so rely on ``Empty`` alone.
            try:
                page_index = self.q.get(block=False)
            except Empty:
                break

            print(self.i, "任務執(zhí)行")
            timestamp = time.time()
            url = (
                "https://careers.tencent.com/tencentcareer/api/post/Query"
                f"?timestamp={timestamp}&keyword=python&pageIndex={page_index}"
                "&pageSize=10&language=zh-cn&area=cn"
            )
            try:
                self.get_html(url)
            except (requests.RequestException, ValueError) as err:
                # Network failure or a body that is not valid JSON: report
                # it and move on to the next page instead of hiding it.
                print(self.i, f"request for page {page_index} failed: {err}")
                continue
            print(self.i, "任務結束")

    def get_html(self, url):
        """Request ``url`` and put its decoded JSON body on the output queue.

        Raises:
            requests.RequestException: If the request fails or times out.
            ValueError: If the response body is not valid JSON.
        """
        headers = {
            # Present a browser user agent so the request is not rejected
            # as coming from a script.
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        response = requests.get(
            url=url, headers=headers, timeout=self.REQUEST_TIMEOUT
        ).json()
        # The response goes to the consumer queue. Putting it back on the
        # task queue would feed response dicts to the producers as page
        # indexes and leave the consumers with nothing to read.
        target = self.out_q if self.out_q is not None else c
        target.put(response)
class Customer(threading.Thread):
    """Consumer thread: turn queued API responses into lines of a text file.

    The thread reads three module-level names that the ``__main__`` section
    creates: the response queue ``c``, the boolean ``flag`` that is set once
    every producer has finished, and ``lock``, which serialises writes to
    the output file.

    Args:
        j: Label printed in the progress messages to identify this thread.
    """

    # Seconds to block on the queue before re-checking the stop condition.
    # Blocking briefly keeps an idle consumer from spinning at full CPU.
    POLL_INTERVAL = 0.1

    def __init__(self, j):
        super().__init__()
        self.j = j

    def run(self):
        """Process responses until the producers are done and ``c`` is empty."""
        while True:
            try:
                response = c.get(timeout=self.POLL_INTERVAL)
            except Empty:
                # Read ``flag`` before re-checking emptiness: once it is
                # true no producer can add items, so an empty queue is
                # final. The opposite order could miss a late item.
                if flag and c.empty():
                    break
                continue

            print(self.j, "任務執(zhí)行")
            try:
                self.parse_html(response)
            except (KeyError, TypeError, OSError) as err:
                # Unexpected response shape or a failed write: report it
                # and keep consuming rather than silently dropping it.
                print(self.j, f"response skipped: {err!r}")
                continue
            print(self.j, '任務結束')

    def parse_html(self, response):
        """Append one line per job posting in ``response`` to the output file.

        Args:
            response: Decoded API response; the postings are read from
                ``response['Data']['Posts']``.

        Raises:
            KeyError: If the response or a posting lacks an expected key.
            TypeError: If the response or a posting is not subscriptable.
        """
        # NOTE(review): the API presumably returns a null ``Posts`` for pages
        # past the last one - confirm. Either way, treat it as "no postings".
        job_list = response['Data']['Posts'] or []
        if not job_list:
            return

        # Open the file once per response instead of once per posting.
        with lock, open('騰訊招聘.txt', 'a', encoding='utf-8') as fp:
            for job in job_list:
                fp.write(self._format_job(job) + '\n')

    @staticmethod
    def _format_job(job):
        """Return the single-line text record for one posting dict."""
        name = job['RecruitPostName']
        address = job['LocationName']
        # Strip line breaks so every posting stays on one line of the file.
        responsibility = job['Responsibility'].replace('\n', '').replace('\r', '')
        post_url = job['PostURL']
        return f'工作名稱:{name},工作地點:{address},崗位職責:{responsibility},詳情url:{post_url}'
if __name__ == '__main__':
    # The names lock, flag, q and c must stay at module level: the worker
    # classes look them up as globals instead of receiving them as arguments.
    lock = threading.Lock()  # guards writes to the output file
    flag = False  # becomes True once every producer thread has finished
    start = time.time()

    # Task queue for the producers: page indexes 1 to 20.
    q = Queue()
    for page_index in range(1, 21):
        q.put(page_index)

    # Queue the Customer threads read from.
    c = Queue()

    # Start three producers first, then three consumers.
    producers = [Pcoduct(label, q) for label in ('p1', 'p2', 'p3')]
    for producer in producers:
        producer.start()

    consumers = [Customer(label) for label in ('c1', 'c2', 'c3')]
    for consumer in consumers:
        consumer.start()

    # Block the main thread until all producers are done, then signal the
    # consumers that no further work will arrive and wait for them as well.
    for producer in producers:
        producer.join()
    flag = True
    for consumer in consumers:
        consumer.join()

    # Report the elapsed wall-clock time in seconds.
    print(time.time() - start)
總結
以上是生活随笔為你收集整理的爬虫多线程生产者与消费者的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 无界面(webdriver.Phanto
- 下一篇: mongodb命令基础知识点