爬虫 防盗链
聲明:以某某圖 為例,代碼僅供學習參考!
1、利用 Fiddler 訪問某某圖首頁,抓取請求的 headers(獲取結果如下):
# Request headers captured with Fiddler while visiting the site's home page.
# They are sent with every request; per the tutorial, replaying them
# (notably "Referer") is what gets image requests past the hotlink protection.
headers = {
    "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
    # "Accept-Encoding": "gzip, deflate",  # disabled: causes garbled output when inspected locally
    "Accept-Language": "zh-CN,zh;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Connection": "keep-alive",
    "Referer": "http://www.mzitu.com",
}

# 2. Build the header list for later use
# An opener's ``addheaders`` attribute is a list of (name, value) tuples,
# so flatten the ``headers`` dict into that shape once, up front.
headall = list(headers.items())

# 3. Fetch the HTML content
def openhtml():
    """Fetch the page at the module-level ``url`` and return its raw body.

    Builds an opener that routes HTTP traffic through the local Fiddler
    proxy, keeps cookies in an in-memory jar and sends the captured browser
    headers (``headall``). The opener is also installed as urllib's global
    default opener.

    Returns:
        bytes: the undecoded response body.

    Raises:
        urllib.error.URLError: if the request fails, e.g. when nothing is
            listening on the proxy address.
    """
    # Imported locally so this snippet is self-contained under Python 3
    # (``urllib2`` was merged into ``urllib.request``).
    import http.cookiejar
    import urllib.request

    cjar = http.cookiejar.CookieJar()
    # 127.0.0.1:8888 is Fiddler's proxy address; routing through it makes it
    # easy to inspect the traffic and track down errors.
    proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8888'})
    opener = urllib.request.build_opener(
        proxy,
        urllib.request.HTTPHandler,
        urllib.request.HTTPCookieProcessor(cjar),
    )
    opener.addheaders = headall
    urllib.request.install_opener(opener)
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        return response.read()


# 4. Use a regular expression to collect every image link and save the images locally
def _find_image_urls(html):
    """Return the ``.jpg`` URLs held in ``data-original='...'`` attributes.

    Accepts ``str`` or ``bytes``; ``None`` or empty input yields ``[]``.
    """
    import re

    if not html:
        return []
    if isinstance(html, bytes):
        # URLs are ASCII, so undecodable bytes elsewhere in the page are harmless.
        html = html.decode("utf-8", errors="replace")
    # ``[^']`` keeps a match inside a single attribute value, so a non-jpg
    # attribute can no longer be glued onto a later ``.jpg`` URL.
    return re.findall(r"data-original='([^']*?\.jpg)", html)


def download(data, save_dir=r"C:\Users\zzz\Desktop\images"):
    """Download every lazy-loaded ``.jpg`` referenced in ``data``.

    Each image is requested with the captured browser headers (``headall``)
    and written to ``save_dir`` as ``0.jpg``, ``1.jpg``, ... in page order.

    Args:
        data: HTML of the listing page, as ``str`` or ``bytes``.
        save_dir: Directory receiving the images; created if missing.
            Defaults to the desktop folder used by the original tutorial.

    Returns:
        None.
    """
    import os
    import urllib.request

    image_urls = _find_image_urls(data)
    if not image_urls:
        return

    os.makedirs(save_dir, exist_ok=True)

    opener = urllib.request.build_opener()
    # This is the heart of beating the hotlink protection: replay the
    # captured browser headers on every image request.
    opener.addheaders = headall

    for index, image_url in enumerate(image_urls):
        print(image_url)
        with opener.open(image_url) as response:
            image_bytes = response.read()
        target = os.path.join(save_dir, str(index) + ".jpg")
        with open(target, "wb") as image_file:
            image_file.write(image_bytes)


# 5. Complete code
# coding=utf8
"""Tutorial crawler: fetch a listing page and save its images locally.

The image requests replay browser headers captured with Fiddler, which is
how the tutorial gets past the site's hotlink protection. For study only.
"""
import http.cookiejar
import os
import re
import urllib.request

url = "http://www.mzitu.com/xinggan"

# Request headers captured with Fiddler while visiting the site's home page.
headers = {
    "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
    # "Accept-Encoding": "gzip, deflate",  # disabled: causes garbled output when inspected locally
    "Accept-Language": "zh-CN,zh;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Connection": "keep-alive",
    "Referer": "http://www.mzitu.com",
}

# An opener's ``addheaders`` attribute is a list of (name, value) tuples.
headall = list(headers.items())

# ``[^']`` keeps a match inside a single attribute value, so a non-jpg
# attribute can never be glued onto a later ``.jpg`` URL.
_IMAGE_URL_RE = re.compile(r"data-original='([^']*?\.jpg)")


def openhtml():
    """Fetch the page at ``url`` and return its raw body as ``bytes``.

    Routes HTTP traffic through the local Fiddler proxy, keeps cookies in an
    in-memory jar, sends the captured headers, and installs the opener as
    urllib's global default.

    Raises:
        urllib.error.URLError: if the request fails, e.g. when nothing is
            listening on the proxy address.
    """
    cjar = http.cookiejar.CookieJar()
    # 127.0.0.1:8888 is Fiddler's proxy address; routing through it makes it
    # easy to inspect the traffic and track down errors.
    proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8888'})
    opener = urllib.request.build_opener(
        proxy,
        urllib.request.HTTPHandler,
        urllib.request.HTTPCookieProcessor(cjar),
    )
    opener.addheaders = headall
    urllib.request.install_opener(opener)
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        return response.read()


def _find_image_urls(html):
    """Return the ``.jpg`` URLs held in ``data-original='...'`` attributes.

    Accepts ``str`` or ``bytes``; ``None`` or empty input yields ``[]``.
    """
    if not html:
        return []
    if isinstance(html, bytes):
        # URLs are ASCII, so undecodable bytes elsewhere in the page are harmless.
        html = html.decode("utf-8", errors="replace")
    return _IMAGE_URL_RE.findall(html)


def download(data, save_dir=r"C:\Users\zzz\Desktop\images"):
    """Download every lazy-loaded ``.jpg`` referenced in ``data``.

    Each image is requested with the captured browser headers and written to
    ``save_dir`` as ``0.jpg``, ``1.jpg``, ... in page order.

    Args:
        data: HTML of the listing page, as ``str`` or ``bytes``.
        save_dir: Directory receiving the images; created if missing.
            Defaults to the desktop folder used by the original tutorial.

    Returns:
        None.
    """
    image_urls = _find_image_urls(data)
    if not image_urls:
        return

    os.makedirs(save_dir, exist_ok=True)

    opener = urllib.request.build_opener()
    # This is the heart of beating the hotlink protection: replay the
    # captured browser headers on every image request.
    opener.addheaders = headall

    for index, image_url in enumerate(image_urls):
        print(image_url)
        with opener.open(image_url) as response:
            image_bytes = response.read()
        target = os.path.join(save_dir, str(index) + ".jpg")
        with open(target, "wb") as image_file:
            image_file.write(image_bytes)


if __name__ == '__main__':
    data = openhtml()
    download(data)
轉載于:https://www.cnblogs.com/z-z-z/p/7755763.html
總結
- 上一篇: [.net]webform 版本冲突
- 下一篇: mysql 查询条件