Practical Python crawling: multi-threaded scraping of 前程无忧 (51job)
This post walks through a practical example of multi-threaded crawling in Python: scraping job listings from 51job (前程无忧) with a producer/consumer pair of thread classes and two queues, shared here as a reference. The full script follows.
import requests
import re
import threading
import time
from queue import Queue

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}


# Custom thread -- producer
class Procuder(threading.Thread):
    # Initialize with the page-URL queue and the queue that holds scraped data
    def __init__(self, pageurl_queue, jobinfo_queue, *args, **kwargs):
        super(Procuder, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.jobinfo_queue = jobinfo_queue

    # Override run
    def run(self):
        while True:
            # Exit the loop once the URL queue is empty
            if self.pageurl_queue.empty():
                break
            # Take a listing-page URL from the queue
            url = self.pageurl_queue.get()
            # Parse the listing page
            self.parse_page(url)

    # Parse one listing page
    def parse_page(self, url):
        # Request the page and decode it as gbk
        resp = requests.get(url, headers=HEADERS)
        resp.encoding = "gbk"
        text = resp.text
        # Extract the URL of every job posting on the page with a regex
        jobs_url = re.findall('<div class="el">.*?<a target="_blank".*?href="(.*?)".*?>', text, re.DOTALL)
        for x in jobs_url:
            # Parse each individual job posting
            self.parse_job_info(x)

    # Parse one job posting
    def parse_job_info(self, url):
        # Request the page and decode it as gbk
        resp = requests.get(url, headers=HEADERS)
        resp.encoding = "gbk"
        text = resp.text
        # Extract the fields of interest with regexes
        info = re.findall(r'<p class="msg ltype" title="(.*?)"', text, re.DOTALL)
        if len(info) > 0:
            all_info = re.sub(" ", "", info[0])
            infos = all_info.split("|")
            if len(infos) >= 5 and infos[4].find("发布") >= 0:
                jobname = re.findall(r'<div class="cn">.*?>(.*?)<input', text, re.DOTALL)[0].split(' ', 1)[0]
                if jobname == "":
                    jobname = "null"
                companyname = re.findall(r'<p class="cname">.*?title="(.*?)"', text, re.DOTALL)[0]
                if companyname == "":
                    companyname = "null"
                companytype = re.findall(r'<div class="com_tag".*?title="(.*?)"', text, re.DOTALL)[0]
                if companytype == "":
                    companytype = "null"
                companysize = re.findall(r'<div class="com_tag".*?</p>.*?title="(.*?)"', text, re.DOTALL)[0]
                if companysize == "":
                    companysize = "null"
                companysalary = re.findall(r'<div class="cn">.*?<strong>(.*?)<', text, re.DOTALL)[0]
                if companysalary == "":
                    companysalary = "null"
                companycity = infos[0]
                workingExp = infos[1]
                edulevel = infos[2]
                needperson = infos[3]
                createdata = infos[4]
                welfare = re.findall(r' <div class="t1">(.*?)<div', text, re.DOTALL)[0]
                if welfare == "" or welfare.isspace():
                    welfare = "null"
                welfare = re.sub("\n", "", welfare)
                welfare = re.sub(" ", "", welfare)
                welfare = re.sub("<.*?>", "/", welfare)
                welfare = re.sub("//", ",", welfare)
                welfare = re.sub("/", "", welfare)
                welfare = re.sub("\r", "", welfare)
                # Put the scraped record into the data queue
                self.jobinfo_queue.put((jobname, companyname, companytype, companysize, companycity, companysalary,
                                        edulevel, workingExp, welfare, needperson, createdata))


# Custom thread -- consumer
class Consumer(threading.Thread):
    # Initialize with the page-URL queue and the queue that holds scraped data
    def __init__(self, pageurl_queue, jobinfo_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.jobinfo_queue = jobinfo_queue

    # Override run
    def run(self):
        while True:
            # Exit once both the URL queue and the data queue are empty
            if self.jobinfo_queue.empty() and self.pageurl_queue.empty():
                break
            # Column order: jobname, companyname, companytype, companysize, companycity,
            # companysalary, edulevel, workingExp, welfare, needperson, createdata
            values = self.jobinfo_queue.get()
            # Append the record to a text file, fields separated by "\001"
            with open("qcwy.txt", "a+", encoding="utf-8", newline="") as f:
                f.write(values[0] + '\001' + values[1] + '\001' + values[2] + '\001' + values[3] + '\001' +
                        values[4] + '\001' + values[5] + '\001' + values[6] + '\001' + values[7] + '\001' +
                        values[8] + '\001' + values[9] + '\001' + values[10] + "\n")
                print("完成")  # "done"


# Return the number of result pages for a search URL
def return_pages(url):
    resp = requests.get(url, headers=HEADERS)
    resp.encoding = "gbk"
    text = resp.text
    page = re.findall('<div class="rt">.*?</span> / (.*?)<', text, re.DOTALL)[0]
    return page.strip()


def main():
    # Create the queues
    pageurl_queue = Queue(200000)
    jobinfo_queue = Queue(200000)
    start_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,1.html"
    info_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,{}.html"
    # 51job area codes for the provinces and cities to crawl
    city_code = ['010000', '020000', '030000', '050000', '060000', '070000',
                 '080000', '090000', '100000', '110000', '120000', '130000',
                 '140000', '150000', '160000', '170000', '180000', '190000',
                 '200000', '210000', '220000', '230000', '240000', '250000',
                 '260000', '270000', '280000', '290000', '300000', '310000',
                 '320000', '110200', '030200', '040000', '080200', '180200',
                 '200200', '070200', '090200', '030800', '230300', '230200',
                 '080300', '170200', '070300', '250200', '190200', '150200',
                 '120300', '120200', '220200', '240200']
    # Put the URL of every listing page into the queue
    for x in city_code:
        for y in range(1, int(return_pages(start_url.format(x))) + 1):
            u = info_url.format(x, y)
            pageurl_queue.put(u)
    # Start 100 producer threads
    for x in range(100):
        t = Procuder(pageurl_queue, jobinfo_queue)
        t.start()
    time.sleep(8)
    # Start 100 consumer threads
    for x in range(100):
        t = Consumer(pageurl_queue, jobinfo_queue)
        t.start()


if __name__ == '__main__':
    main()

Summary
That is the complete multi-threaded 51job crawler; hopefully it helps you with similar scraping problems.
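Since each consumer appends one record per line to qcwy.txt with the eleven fields separated by the "\001" character, the output can be loaded back with Python's csv module. The snippet below is only a minimal sketch under that assumption; the COLUMNS list simply mirrors the tuple order used in parse_job_info and is not part of the original script.

# Minimal sketch (not from the original post): load the records written by the
# Consumer threads above. Assumes qcwy.txt is in the current directory and uses
# the same "\001" separator and column order as the script.
import csv

COLUMNS = ["jobname", "companyname", "companytype", "companysize", "companycity",
           "companysalary", "edulevel", "workingExp", "welfare", "needperson", "createdata"]

with open("qcwy.txt", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, delimiter="\001")
    for row in reader:
        if len(row) != len(COLUMNS):
            continue  # skip malformed lines
        record = dict(zip(COLUMNS, row))
        print(record["jobname"], record["companysalary"])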
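One caveat about the threading model above: both thread classes stop by polling Queue.empty(), so a consumer can see an empty queue while producers are still working (the time.sleep(8) only papers over this), or pass the check and then block forever in get(). A common alternative is to join the producers and then push one sentinel per consumer; the snippet below is only an illustrative sketch of that pattern with toy data, not a drop-in replacement for the classes above.

# Sketch of sentinel-based shutdown as an alternative to polling Queue.empty().
# All names and the toy workload here are illustrative, not from the post.
import threading
from queue import Queue

SENTINEL = None  # marker telling a consumer that no more data will arrive

def produce(q, items):
    for item in items:
        q.put(item)

def consume(q):
    while True:
        item = q.get()
        if item is SENTINEL:
            break               # clean exit, no busy polling
        print("got", item)      # stand-in for writing the record to disk

def demo():
    q = Queue()
    producers = [threading.Thread(target=produce, args=(q, range(i * 5, (i + 1) * 5))) for i in range(3)]
    consumers = [threading.Thread(target=consume, args=(q,)) for _ in range(2)]
    for t in producers + consumers:
        t.start()
    for t in producers:
        t.join()                # wait until every producer has finished
    for _ in consumers:
        q.put(SENTINEL)         # one sentinel per consumer
    for t in consumers:
        t.join()

if __name__ == "__main__":
    demo()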