用requests爬取一个招聘网站
生活随笔
收集整理的這篇文章主要介紹了《用requests爬取一个招聘网站》,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
import requests
import re
# Step 1: visit the login page and extract the anti-forgery token and code.
#   1. Request URL: https://passport.lagou.com/login/login.html
#   2. Request method: GET (a GET request carries no request body)
#   3. Request headers: User-Agent
# A single session is reused by every later step so cookies carry over.
session = requests.session()

r1 = session.get(
    'https://passport.lagou.com/login/login.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# re.findall returns a list, so take the first match. An IndexError here
# means the page no longer embeds the value in the expected form.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# Step 2: log in.
#   1. Request URL: https://passport.lagou.com/login/login.json
#   2. Request method: POST
#   3. Request headers: Cookie, User-Agent, Referer,
#      X-Anit-Forge-Code, X-Anit-Forge-Token
#   4. Request body:
#        isValidate: true
#        username: 18611453110
#        password: 70621c64832c4d4d66a47be6150b4a8e
#        request_form_verifyCode: ''
#        submit: ''
# NOTE(review): the 'X-Anit-...' spelling matches the header names listed in
# the captured-request notes of this article; verify before "correcting" it.
# NOTE(review): credentials are hard-coded; load them from the environment or
# a config file before reusing this script.
r2 = session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        "isValidate": True,
        'username': '18611453110',  # login user name
        'password': '70621c64832c4d4d66a47be6150b4a8e',  # already-encrypted password
        'request_form_verifyCode': '',
        'submit': '',
    },
)
# Step 3: authorize.
#   1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
#   2. Request method: GET
#   3. Request headers: User-Agent, Referer
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
    },
)

# Step 4: verify that the login succeeded by fetching the resume page.
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# print('18611453110' in r4.text)  # True when the login succeeded
# Step 5: filter job postings.
#   Request URL: https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
#   Request method: GET
#   Request headers: User-Agent
#   Query parameters:
#     gj: 3年及以下   (experience: 3 years or less)
#     px: default
#     yx: 25k-50k
#     city: 北京
from urllib.parse import quote_plus

# Percent-encode the search keyword for the URL path. The keyword is written
# in Simplified Chinese so the result matches the documented request URL
# above (%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91).
res = quote_plus('java高级开发', encoding='utf-8')
url = 'https://www.lagou.com/jobs/list_' + res

# The plain GET below is kept for reference only: the listing page does not
# contain the job data (it is loaded via Ajax), so the next step posts to the
# Ajax endpoint instead.
# r5 = session.get(url,
#                  headers={
#                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                  },
#                  params={
#                      'gj': '3年及以下',
#                      'px': 'default',
#                      'yx': '25k-50k',
#                      'city': '北京'
#                  }
#                  )
#
# print(r5.text)
# Step 6: fetch the job list from the Ajax endpoint.
#   Request URL: https://www.lagou.com/jobs/positionAjax.json
#   Request method: POST
#   Request headers: Referer, User-Agent
#   Request body:
#     first: true
#     pn: 1
#     kd: java高级开发
#   Query parameters:
#     gj: 3年及以下, px: default, yx: 25k-50k, city: 北京,
#     needAddtionalResult: False, isSchoolJob: 0
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

r6 = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={
        'Referer': url,
        'User-Agent': user_agent,
    },
    # Keyword in Simplified Chinese, matching the documented listing URL.
    data={'first': True, 'pn': 1, 'kd': 'java高级开发'},
    params={
        'gj': '3年及以下',
        'px': 'default',
        'yx': '25k-50k',
        'city': '北京',
        'needAddtionalResult': False,
        'isSchoolJob': 0,
    },
)

comapines_list = r6.json()['content']['positionResult']['result']
for comapiny in comapines_list:
    positionId = comapiny['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = comapiny['companyShortName']
    positionName = comapiny['positionName']
    salary = comapiny['salary']
    # One field per line; the line breaks of the triple-quoted template
    # were lost when the article was collapsed.
    print('''
    詳情連接:%s
    公司名:%s
    職位名:%s
    薪資:%s
    ''' % (company_link, companyShortName, positionName, salary))

    # Steps 7 and 8 run inside the loop, once per job found in step 6.

    # Step 7: open the job detail page and extract a fresh token and code.
    #   Request URL: the detail page address
    #   Request method: GET
    #   Request headers: User-Agent
    r7 = session.get(company_link, headers={'User-Agent': user_agent})
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]

    # Step 8: deliver the resume.
    #   Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    #   Request method: POST
    #   Request headers:
    #     Referer: the detail page address
    #     User-Agent
    #     X-Anit-Forge-Code: 53165984
    #     X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
    #     X-Requested-With: XMLHttpRequest
    #   Request body:
    #     positionId: the job ID
    #     type: 1
    #     force: true
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': user_agent,
            'Referer': company_link,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={'positionId': positionId, 'type': 1, 'force': True},
    )
    # NOTE(review): printed unconditionally; the response is not inspected.
    print('%s 投遞成功' % (companyShortName))
第7步和第8步是並列的,都放在第六步的循環裡面:第六步找到一個公司,進入詳情頁,然後投遞簡歷。
import re
# Step 1: visit the login page and extract the anti-forgery token and code.
#   1. Request URL: https://passport.lagou.com/login/login.html
#   2. Request method: GET (a GET request carries no request body)
#   3. Request headers: User-Agent
# A single session is reused by every later step so cookies carry over.
session = requests.session()

r1 = session.get(
    'https://passport.lagou.com/login/login.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# re.findall returns a list, so take the first match. An IndexError here
# means the page no longer embeds the value in the expected form.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# Step 2: log in.
#   1. Request URL: https://passport.lagou.com/login/login.json
#   2. Request method: POST
#   3. Request headers: Cookie, User-Agent, Referer,
#      X-Anit-Forge-Code, X-Anit-Forge-Token
#   4. Request body:
#        isValidate: true
#        username: 18611453110
#        password: 70621c64832c4d4d66a47be6150b4a8e
#        request_form_verifyCode: ''
#        submit: ''
# NOTE(review): the 'X-Anit-...' spelling matches the header names listed in
# the captured-request notes of this article; verify before "correcting" it.
# NOTE(review): credentials are hard-coded; load them from the environment or
# a config file before reusing this script.
r2 = session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        "isValidate": True,
        'username': '18611453110',  # login user name
        'password': '70621c64832c4d4d66a47be6150b4a8e',  # already-encrypted password
        'request_form_verifyCode': '',
        'submit': '',
    },
)
# Step 3: authorize.
#   1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
#   2. Request method: GET
#   3. Request headers: User-Agent, Referer
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
    },
)

# Step 4: verify that the login succeeded by fetching the resume page.
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# print('18611453110' in r4.text)  # True when the login succeeded
# Step 5: filter job postings.
#   Request URL: https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
#   Request method: GET
#   Request headers: User-Agent
#   Query parameters:
#     gj: 3年及以下   (experience: 3 years or less)
#     px: default
#     yx: 25k-50k
#     city: 北京
from urllib.parse import quote_plus

# Percent-encode the search keyword for the URL path. The keyword is written
# in Simplified Chinese so the result matches the documented request URL
# above (%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91).
res = quote_plus('java高级开发', encoding='utf-8')
url = 'https://www.lagou.com/jobs/list_' + res

# The plain GET below is kept for reference only: the listing page does not
# contain the job data (it is loaded via Ajax), so the next step posts to the
# Ajax endpoint instead.
# r5 = session.get(url,
#                  headers={
#                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                  },
#                  params={
#                      'gj': '3年及以下',
#                      'px': 'default',
#                      'yx': '25k-50k',
#                      'city': '北京'
#                  }
#                  )
#
# print(r5.text)
沒有取到數據,因為數據是通過 ajax 發送的,所以我們換另一種方法解決:
# Step 6: fetch the job list from the Ajax endpoint.
#   Request URL: https://www.lagou.com/jobs/positionAjax.json
#   Request method: POST
#   Request headers: Referer, User-Agent
#   Request body:
#     first: true
#     pn: 1
#     kd: java高级开发
#   Query parameters:
#     gj: 3年及以下, px: default, yx: 25k-50k, city: 北京,
#     needAddtionalResult: False, isSchoolJob: 0
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

r6 = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={
        'Referer': url,
        'User-Agent': user_agent,
    },
    # Keyword in Simplified Chinese, matching the documented listing URL.
    data={'first': True, 'pn': 1, 'kd': 'java高级开发'},
    params={
        'gj': '3年及以下',
        'px': 'default',
        'yx': '25k-50k',
        'city': '北京',
        'needAddtionalResult': False,
        'isSchoolJob': 0,
    },
)

comapines_list = r6.json()['content']['positionResult']['result']
for comapiny in comapines_list:
    positionId = comapiny['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = comapiny['companyShortName']
    positionName = comapiny['positionName']
    salary = comapiny['salary']
    # One field per line; the line breaks of the triple-quoted template
    # were lost when the article was collapsed.
    print('''
    詳情連接:%s
    公司名:%s
    職位名:%s
    薪資:%s
    ''' % (company_link, companyShortName, positionName, salary))

    # Steps 7 and 8 run inside the loop, once per job found in step 6.

    # Step 7: open the job detail page and extract a fresh token and code.
    #   Request URL: the detail page address
    #   Request method: GET
    #   Request headers: User-Agent
    r7 = session.get(company_link, headers={'User-Agent': user_agent})
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]

    # Step 8: deliver the resume.
    #   Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    #   Request method: POST
    #   Request headers:
    #     Referer: the detail page address
    #     User-Agent
    #     X-Anit-Forge-Code: 53165984
    #     X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
    #     X-Requested-With: XMLHttpRequest
    #   Request body:
    #     positionId: the job ID
    #     type: 1
    #     force: true
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': user_agent,
            'Referer': company_link,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={'positionId': positionId, 'type': 1, 'force': True},
    )
    # NOTE(review): printed unconditionally; the response is not inspected.
    print('%s 投遞成功' % (companyShortName))
第7步和第8步是並列的,都放在第六步的循環裡面:第六步每找到一個公司,就進入其詳情頁,然後投遞簡歷。
轉載于:https://www.cnblogs.com/1a2a/p/8305165.html
總結
以上是生活随笔為你收集整理的用requests爬取一个招聘网站的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: postman 变量
- 下一篇: Node.js 常用Mongoose方法