Crawling beauty pictures

This post shares a small crawler script: it walks the gallery listing pages of www.meitulu.com with requests, parses them with lxml XPath selectors, and saves every image of every album to a local folder, rotating User-Agent strings and sleeping a random interval between downloads so the server is less likely to block it.
```python
import requests
import lxml.html
import re
import time
import os
import random

# List of User-Agent strings; each request picks one at random
# to reduce the chance of being blocked by the server.
user_agent_list = [
    # Windows / Firefox 58
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0",
    # Linux / Firefox 58
    "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0",
    # Mac OS X / Safari 11.0.2
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_2) AppleWebKit/603.1.13 (KHTML, like Gecko) Version/11.0.2 Safari/603.1.13",
    # Windows / IE 11
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    # Windows / Edge 16
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/16.16299.15.0",
    # Windows / Chrome 63
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    # Android Phone / Chrome 63
    "Mozilla/5.0 (Linux; Android 7.0; SM-G935P Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36",
    # Android Tablet / Chrome 63
    "Mozilla/5.0 (Linux; Android 4.4.4; Lenovo TAB 2 A10-70L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Safari/537.36",
    # iPhone / Safari 11.1.1
    # "Mozilla/5.0 (iPhone; CPU iPhone OS 11_1_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/11.1.1 Mobile/14E304 Safari/602.1",
    # iPad / Safari 11.1.1
    "Mozilla/5.0 (iPad; CPU OS 11_1_1 like Mac OS X) AppleWebKit/603.3.3 (KHTML, like Gecko) Version/11.1.1 Mobile/14G5037b Safari/602.1"
]

requests_header = {
    "Host": "",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept": "",
    "Accept-Language": "zh-CN,en-US;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}

PICTURE_PATH = "f:/meitulu"


def download_page_html(url):
    """Fetch one page and return its HTML text, or None on error."""
    phtml = None
    page = None
    try:
        requests_header["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        requests_header["Host"] = "www.meitulu.com"
        requests_header["Referer"] = url
        # Pick a random User-Agent for this request
        requests_header["User-Agent"] = random.choice(user_agent_list)
        page = requests.get(url=url, headers=requests_header, timeout=15)  # request the page
        if page.encoding == "ISO-8859-1":
            page.encoding = "utf-8"  # fix the declared encoding to avoid garbled Chinese text
        phtml = page.text  # the HTML text contained in the response
    except requests.exceptions.RequestException as e:
        print("requests error:", e)
        phtml = None
    finally:
        if page is not None:
            page.close()  # close the requests response
    return phtml


def download_picture(url, page, dir):
    try:
        picdir = "{0}/{1}".format(PICTURE_PATH, dir)  # directory the picture is saved to
        print(picdir)
        if not os.path.exists(picdir):
            os.makedirs(picdir)  # create the directory recursively if it does not exist
        pic_name = url.split("/")[-1]  # use the last URL segment as the file name
        pic_full_name = "{0}/{1}".format(picdir, pic_name)
        requests_header["Accept"] = "image/webp,*/*"
        requests_header["Host"] = "mtl.ttsqgs.com"
        requests_header["Referer"] = page
        response = requests.get(url, headers=requests_header, timeout=15)  # the body is the binary image data
        imgdata = response.content
        if len(imgdata) > (5 * 1024):  # only keep pictures larger than 5 KB
            with open(pic_full_name, 'wb') as f:  # 'wb': write binary
                f.write(imgdata)  # the with statement closes f automatically
            print("save picture to :", pic_full_name)
        else:
            print("picture size too small")
        response.close()
    except Exception:
        print("download picture {0} error".format(url))


def get_page_list_num(tree):
    """Work out how many listing pages need to be crawled."""
    page_all_num = 0
    page_list_num = 0
    try:
        # Use an XPath selector to pick the element holding the total album count.
        page_all_num = tree.xpath('//div[@id="pages"]/a/text()')[0]
        print(page_all_num)
        page_all_num = str(page_all_num)
        page_all_num = re.sub(r"\D", "", page_all_num)  # strip every non-digit character
        page_all_num = int(page_all_num)
        print("max_page_number:", page_all_num)
    except Exception:
        print("get page number error")
        page_all_num = 0
    finally:
        # Round up: each listing page holds 60 albums, so divide the album total by 60.
        page_list_num = page_all_num // (15 * 4)
        if (page_all_num % (15 * 4)) != 0:
            page_list_num += 1
    return page_list_num, page_all_num


def get_page_album_list(tree):
    """Return the album ids listed on the page."""
    page_album_list = tree.xpath('//ul[@class="img"]/li/a/@href')
    for i in range(len(page_album_list)):
        page_album_list[i] = page_album_list[i].split("/")[-1]  # keep the last path segment, e.g. "17748.html"
        page_album_list[i] = re.sub(r"\D", "", page_album_list[i])  # keep only the digits
    return page_album_list


def get_page_title_list(tree):
    """Return the album titles listed on the page."""
    page_title_list = tree.xpath('//ul[@class="img"]/li/a/img/@alt')
    return page_title_list


def get_page_jpgnum_list(tree):
    """Return how many pictures each album on the page contains."""
    page_jpgnum_list = tree.xpath('//ul[@class="img"]/li/p[1]/text()')
    for i in range(len(page_jpgnum_list)):
        page_jpgnum_list[i] = re.sub(r"\D", "", page_jpgnum_list[i])  # keep only the digits
        page_jpgnum_list[i] = int(page_jpgnum_list[i])
    return page_jpgnum_list


REQUEST_URL0 = "https://www.meitulu.com/t/siwayouhuo/"
REQUEST_URL1 = "https://www.meitulu.com/t/siwayouhuo/{0}.html"
REQUEST_ALBUM_URL = "https://www.meitulu.com/item/{0}.html"
REQUEST_JPEG_URL = "https://mtl.ttsqgs.com/images/img/{0}/{1}.jpg"

if __name__ == "__main__":
    requests_url = REQUEST_URL0
    page_list_num = 0
    page_all_num = 0
    print("requests_url :", requests_url)
    page_html_list = download_page_html(requests_url)  # download the first listing page
    if page_html_list is None:
        exit()
    tree = lxml.html.fromstring(page_html_list)
    page_list_num, page_all_num = get_page_list_num(tree)  # number of listing pages
    print(page_list_num, page_all_num)
    for idx in range(page_list_num):
        if idx == 0:
            requests_url = REQUEST_URL0
        else:
            requests_url = REQUEST_URL1.format(idx + 1)
        print(requests_url)
        page_html_list = download_page_html(requests_url)  # download the current listing page
        if page_html_list is None:
            continue
        tree = lxml.html.fromstring(page_html_list)
        page_album_list = get_page_album_list(tree)
        print(idx, len(page_album_list))
        page_title_list = get_page_title_list(tree)
        print(idx, len(page_title_list))
        page_jpgnum_list = get_page_jpgnum_list(tree)
        print(idx, len(page_jpgnum_list))
        if (len(page_album_list) != len(page_title_list)) or \
           (len(page_album_list) == 0) or (len(page_title_list) == 0) or \
           (len(page_jpgnum_list) == 0):
            continue
        for lst in range(len(page_album_list)):
            for img in range(page_jpgnum_list[lst]):
                jpeg_url = REQUEST_JPEG_URL.format(page_album_list[lst], img + 1)
                page_url = REQUEST_ALBUM_URL.format(page_album_list[lst])
                jpg_title = page_title_list[lst]
                print("Download [{0}] on [{1}], title[{2}]".format(jpeg_url, page_album_list[lst], jpg_title))
                download_picture(jpeg_url, page_url, jpg_title)
                web_sleep = random.randint(1, 5)  # sleep a random interval to avoid anti-crawling measures
                time.sleep(web_sleep)
```
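The three listing-page helpers all hinge on the same `//ul[@class="img"]/li/...` selectors. Below is a minimal, self-contained sketch of that extraction against a hand-written HTML fragment; the fragment only mimics the markup the selectors expect and is not copied from the real site, so treat its structure as an assumption.

```python
import re
import lxml.html

# Hypothetical fragment that mimics one album entry on a listing page.
sample_html = """
<div>
  <ul class="img">
    <li>
      <a href="https://www.meitulu.com/item/17748.html">
        <img src="1.jpg" alt="Sample album title" />
      </a>
      <p>图片数量: 40张</p>
    </li>
  </ul>
</div>
"""

tree = lxml.html.fromstring(sample_html)

# Album id: last path segment of the link, digits only -> "17748"
album_ids = [re.sub(r"\D", "", href.split("/")[-1])
             for href in tree.xpath('//ul[@class="img"]/li/a/@href')]

# Album title: the alt text of the cover image
titles = tree.xpath('//ul[@class="img"]/li/a/img/@alt')

# Picture count: digits of the first <p> under each <li> -> 40
jpg_counts = [int(re.sub(r"\D", "", text))
              for text in tree.xpath('//ul[@class="img"]/li/p[1]/text()')]

print(album_ids, titles, jpg_counts)  # ['17748'] ['Sample album title'] [40]
```

Running the selectors against a saved or hand-written fragment like this is a quick way to confirm they still match after the site changes its layout.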
Summary

The script walks every listing page under the siwayouhuo tag on www.meitulu.com, extracts each album's id, title, and picture count with lxml XPath selectors, and then downloads the images album by album into a folder named after the album title under f:/meitulu. To stay under the radar it rotates through a pool of User-Agent strings, discards responses smaller than 5 KB, and sleeps a random 1 to 5 seconds between downloads.
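The anti-blocking measures boil down to a small reusable pattern: pick a random User-Agent, make the request, then pause for a random interval. The sketch below pulls that pattern out into a hypothetical `polite_get` helper (not part of the original script), assuming the same requests library:

```python
import random
import time
import requests

# Two of the User-Agent strings from the pool above, as an example.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
]

def polite_get(url, min_delay=1, max_delay=5, **kwargs):
    """GET a URL with a random User-Agent, then sleep a random interval."""
    headers = kwargs.pop("headers", {})
    headers["User-Agent"] = random.choice(USER_AGENTS)
    response = requests.get(url, headers=headers, timeout=15, **kwargs)
    time.sleep(random.randint(min_delay, max_delay))  # random pause between requests
    return response

# Example usage:
# r = polite_get("https://www.meitulu.com/t/siwayouhuo/")
# print(r.status_code)
```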