模拟知乎登陆(requests和scrapy)
生活随笔
收集整理的這篇文章主要介紹了
模拟知乎登陆(requests和scrapy)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1. requests
登錄知乎需要向服務(wù)器提交的信息有:
? ①headers
? ②_xsrf
? ③captcha
需要通過解析頁面獲得_xsrf和captcha(驗證碼)
而有關captcha的獲取則必須要用session的方式獲得, 目的是為了使_xsrf和驗證碼信息一致
(因為session中可以保存cookie, 保證數據的一致性)代碼如下:
1 import re 2 import time 3 import os.path 4 import requests 5 6 try: 7 import cookielib 8 except: 9 import http.cookiejar as cookielib 10 11 from PIL import Image 12 13 session = requests.session() 14 session.cookies = cookielib.LWPCookieJar(filename="cookies")# 登陸成功后將cookie保存到文件中, 之后登陸就可以直接加載cookie,而不需要輸入賬號(hào)和密碼(session機(jī)制) 15 try: 16 session.cookies.load(ignore_discard=True) 17 except: 18 print("cookies未能加載") 19 20 agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0' 21 # agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36' 22 23 # agent = "Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/57.0" 24 headers = { 25 "Host": "www.zhihu.com", 26 "Referer": "https://www.zhihu.com/", 27 "User-Agent": agent, 28 } 29 30 31 def get_xsrf(): 32 response = session.get("https://www.zhihu.com/", headers= headers) 33 match_ojb = re.search('name="_xsrf" value="(.*)"', response.text) 34 print(response.text) 35 if match_ojb: 36 return match_ojb.group(1) 37 else: 38 print("error") 39 40 41 def get_captcha(): 42 t = str(int(time.time() * 1000)) 43 captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login" 44 r = session.get(captcha_url, headers=headers) 45 with open('captcha.jpg', 'wb') as f: 46 f.write(r.content) 47 f.close() 48 try: 49 im = Image.open('captcha.jpg') 50 im.show() 51 im.close() 52 except: 53 print('請(qǐng)到 % s找到captcha.jpg手動(dòng)輸入'.format(os.path.abspath('captcha.jpg'))) 54 captcha = input("please input the captcha\n") 55 return captcha 56 57 58 def is_login(): 59 # 通過用戶個(gè)人中心驗(yàn)證是否登陸成功 60 check_url = "https://www.zhihu.com/settings/profile" 61 response = session.get(check_url, headers=headers, allow_redirects=False) 62 if response.status_code != 200: 63 return False 64 else: 65 return True 66 67 68 def login(account, password): 69 # 知乎登陸 70 _xsrf = get_xsrf() 71 if '@' in account: 72 print("郵箱登陸") 73 post_url = 
"https://www.zhihu.com/login/email" 74 post_data = { 75 "_xsrf": _xsrf, 76 "password": password, 77 "email": account, 78 } 79 else: 80 if re.match('^1\d{10}', account): 81 print("手機(jī)登陸") 82 post_url = "https://www.zhihu.com/login/phone_num" 83 post_data = { 84 "_xsrf": get_xsrf(), 85 "password": password, 86 "phone_num": account, 87 } 88 # 不需要驗(yàn)證碼直接登錄成功 89 response = session.post(post_url, data=post_data, headers=header) 90 login_code = response.json() 91 92 if login_code['r'] == 1: 93 print("不輸入驗(yàn)證碼登陸失敗") 94 #當(dāng)不輸入驗(yàn)證碼登錄失敗時(shí), 獲取驗(yàn)證碼, 重新登錄 95 post_data["captcha"] = get_captcha() 96 response = session.post(post_url, data=post_data, headers=header) 97 login_code = response.json() 98 print(login_code['msg']) 99 100 session.cookies.save() 101 102 if __name__ == '__main__': 103 if is_login(): 104 print("已經(jīng)登陸!") 105 else: 106 login(account, password)2.?scrapy
如果在scrapy中直接調(diào)用上文中的get_captcha()函數(shù)來獲得驗(yàn)證碼,?然后提交是無法登陸成功的,?原因是數(shù)據(jù)不一致,也就是說獲取的_xsrf和驗(yàn)證碼一起提交到服務(wù)器是不匹配的.
scrapy機制是默認保存cookie的,所以可以通過兩個request請求來將得到的信息保存在默認的cookie中,代碼如下:
# -*- coding: utf-8 -*-
import re
import json
import time
import datetime

try:
    import urlparse as parse  # Python 2
except ImportError:
    from urllib import parse  # Python 3

import scrapy


class ZhihuSpider(scrapy.Spider):
    """Zhihu spider that logs in before crawling.

    Scrapy keeps cookies automatically, so chaining the sign-in page ->
    captcha image -> login POST through callbacks guarantees the _xsrf
    token and the captcha belong to the same cookie session.
    """
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    headers = {
        "HOST": "www.zhihu.com",
        # Bug fix: Referer pointed to the typo'd domain "zhizhu.com",
        # which defeats the referer check this header exists to satisfy.
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def start_requests(self):
        """Open the sign-in page first so the session cookie gets created."""
        return [scrapy.Request('https://www.zhihu.com/#signin',
                               headers=self.headers, callback=self.login)]

    def login(self, response):
        """Extract the _xsrf token from the sign-in page, then fetch the captcha."""
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        xsrf = match_obj.group(1) if match_obj else ''
        if not xsrf:
            return  # no token -> logging in is impossible; yield nothing

        post_data = {
            "_xsrf": xsrf,
            "phone_num": "",
            "password": "",
            "captcha": ""
        }
        t = str(int(time.time() * 1000))
        captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
        # Carry the form data through meta so the next callback can complete it.
        yield scrapy.Request(captcha_url, headers=self.headers,
                             meta={"post_data": post_data},
                             callback=self.login_after_captcha)

    def login_after_captcha(self, response):
        """Save and display the captcha, ask the user for it, then submit the form."""
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)

        from PIL import Image
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except Exception:
            # Best effort: headless environments can still read captcha.jpg.
            pass

        captcha = input("輸入驗(yàn)證碼\n>")

        post_data = response.meta.get("post_data", {})
        post_data["captcha"] = captcha
        return [scrapy.FormRequest(
            url="https://www.zhihu.com/login/phone_num",
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]

    def check_login(self, response):
        """Inspect the JSON login reply; on success start crawling start_urls."""
        text_json = json.loads(response.text)
        if text_json.get("msg") == "登錄成功":
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
?
轉(zhuǎn)載于:https://www.cnblogs.com/fenglj/p/7891500.html
總結(jié)
以上是生活随笔為你收集整理的模拟知乎登陆(requests和scrapy)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: java 编程思想笔记(七)——异常
- 下一篇: RobotStudio Smart组