拉勾网数据抓取
import json
import re
import timeimport requests
import multiprocessingclass HandleLaGou():def __init__(self):# 使用session保存cookies信息self.lagou_session = requests.Session()self.header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}self.city_list = ""# 獲取全國所有城市列表的方法def handle_city(self):city_search = re.compile(r'zhaopin/">(.*?)</a>')city_url = 'https://www.lagou.com/jobs/allCity.html'city_result = self.handle_request(method="GET", url=city_url)# 使用正則表達式獲取城市列表self.city_list = city_search.findall(city_result)self.lagou_session.cookies.clear() # 清除cookies# print(city_result)def handle_city_job(self, city):first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % cityfirst_response = self.handle_request(method="GET", url=first_request_url)total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')try:total_page = total_page_search.search(first_response).group(1)# 由于沒有崗位信息造成exceptionexcept:returnelse:for i in range(1, int(total_page) + 1):data = {"pn":i,"kd":"python"}page_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false"% cityreferer_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s'%city#referer_url需要進行encodeself.header['Referer'] = referer_url.encode()response = self.handle_request("POST",page_url,data=data,info=city)lagou_data = json.loads(response)job_list = lagou_data['content']['positionResult']['result']for job in job_list:print(job)print(total_page)def handle_request(self, method, url, data=None, info=None):while True:#加入阿布云代理proxyinfo = "http://%s:%s@%s:%s"%('阿布云賬號','阿布云密碼','阿布云host','阿布云port')proxy = {"http":proxyinfo,"https":proxyinfo}try:if method == "GET":response = self.lagou_session.get(url=url, headers=self.header,proxies=proxy,timeout=6)elif method =="POST":response = self.lagou_session.post(url=url, headers=self.header,data=data,proxies=proxy,timeout=6)except:# 
需先清除cookies信息,然后重新獲取
self.lagou_session.cookies.clear()first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % infoself.handle_request(method="GET", url=first_request_url)time.sleep(10)continueresponse.encoding = 'utf-8'if '頻繁' in response.text:print("頻繁")#需先清除cookies信息,然后重新獲取
self.lagou_session.cookies.clear()first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % infoself.handle_request(method="GET", url=first_request_url)time.sleep(10)continuereturn response.textif __name__ == '__main__':lagou = HandleLaGou()# 所有城市方法
lagou.handle_city()#引入多進程,加速抓取pool = multiprocessing.Pool(2)for city in lagou.city_list:pool.apply_async(lagou.handle_city_job,args=(city,))pool.close()pool.join()
?
轉載于:https://www.cnblogs.com/Erick-L/p/11348119.html
總結
- 上一篇: 关于MySQL 5.6 中文乱码的问题(
- 下一篇: Python中断多重循环的几种思路