python爬虫爬美女图片:“小甲鱼056节论一只爬虫的自我修养4:OOXX” 最终更新2021.6.27日,更新后可用!!!
目錄
2020.10.25日更新
代碼:
2020.11.10日更新
代碼
2021.6.27日更新,更新后可用
2020.10.25日更新
需要改動有兩個方面:
第一方面,網址編碼有變化,如 http://jandan.net/ooxx/MjAyMDEwMjUtODY=#comments,尾部的「MjAyMDEwMjUtODY=」為 base64 編碼,構造網址時在對應位置使用 base64 編碼即可。
第二方面,圖片地址發生了小改變,只需要在將圖片地址加入列表時,於地址前增加「http:」即可。
代碼:
import base64
import os
import urllib.request

# Browser-like User-Agent sent with every request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')


def url_open(url):
    """Fetch *url* and return the raw response body.

    :param url: address to request
    :return: response body as bytes
    """
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()


def parse_page_number(html):
    """Extract the text between the page marker and the next ']'.

    :param html: decoded page source
    :return: the page number as a string
    :raises ValueError: if the marker or the closing ']' is missing
    """
    marker = 'current-comment-page'
    pos = html.find(marker)
    if pos == -1:
        raise ValueError('page marker %r not found' % marker)
    # 23 = len(marker) + 3; presumably skips the '">[' that follows the
    # class name in the page markup.
    start = pos + 23
    end = html.find(']', start)
    if end == -1:
        raise ValueError('closing "]" of the page number not found')
    return html[start:end]


def get_page(url):
    """Return the current page number shown on *url*, as a string."""
    return parse_page_number(url_open(url).decode('utf-8'))


def extract_img_addrs(html):
    """Collect the .jpg addresses found in ``img src=`` attributes.

    :param html: decoded page source
    :return: list of image URLs; protocol-relative ones get 'http:' prepended
    """
    tag = 'img src='
    img_addrs = []
    pos = html.find(tag)
    while pos != -1:
        start = pos + len(tag) + 1  # skip the opening quote
        limit = pos + 255
        quote = html[start - 1:start]
        if quote in ('"', "'"):
            closing = html.find(quote, start)
            if closing != -1:
                # Never search past this attribute, otherwise a non-jpg
                # image would swallow the next tag's '.jpg'.
                limit = min(limit, closing)
        ext = html.find('.jpg', start, limit)
        if ext != -1:
            address = html[start:ext + 4]
            if address.startswith('//'):
                address = 'http:' + address
            img_addrs.append(address)
            resume = ext
        else:
            resume = start
        pos = html.find(tag, resume)
    return img_addrs


def find_imgs(url):
    """Download *url* and return the image addresses it contains."""
    return extract_img_addrs(url_open(url).decode('utf-8'))


def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    :param folder: unused here; download_mm() has already chdir'd into it
    :param img_addrs: iterable of image URLs
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)


def build_page_url(base_url, date_str, page_num):
    """Build a listing URL whose last path part is base64('<date>-<page>')."""
    s_page = '{}-{}'.format(date_str, page_num)
    str_page = base64.b64encode(s_page.encode('utf-8')).decode('utf-8')
    return base_url + str_page + '#comments'


def page_numbers(latest, pages):
    """Return up to *pages* consecutive page numbers counting down from *latest*."""
    # The original `page_num -= i` subtracted cumulatively and skipped pages.
    return [latest - i for i in range(pages) if latest - i >= 1]


def download_mm(folder='ooxx', pages=10, date_str='20201025'):
    """Download the images of the newest *pages* listing pages.

    :param folder: directory to store images in (created if missing)
    :param pages: number of listing pages to walk, newest first
    :param date_str: ``YYYYMMDD`` date encoded into each page URL
    """
    os.makedirs(folder, exist_ok=True)  # allow re-running the script
    os.chdir(folder)
    url = 'https://jandan.net/ooxx/'
    latest = int(get_page(url))
    for page_num in page_numbers(latest, pages):
        page_url = build_page_url(url, date_str, page_num)
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()

# Article note: in the photo page URL
# http://jandan.net/ooxx/MjAyMDEwMjUtODY=#comments the trailing Base64 text
# decodes to 20201025-86, i.e. today's date + '-' + the photo page number.
# The guess is that the URL changes with the date; if so, adjust the marked
# position in the code as indicated in the figure when using it. Alternatively,
# write code that reads the date and adjusts that part dynamically.
測試圖如下
2020.11.10日更新
上述假設成立,代碼更新,自動讀取當前日期,因為圖片地址隨著日期變化。將此處代碼升級:
import datetime


def get_time():
    """Return today's local date as a compact ``YYYYMMDD`` string."""
    # Format without separators directly instead of formatting with dashes
    # and then popping them out of a character list.
    return datetime.datetime.now().strftime('%Y%m%d')


time_now = get_time()
# Article note: remember to import the datetime package.
代碼
import base64
import datetime
import os
import urllib.request

# Browser-like User-Agent sent with every request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')


def url_open(url):
    """Fetch *url* and return the raw response body.

    :param url: address to request
    :return: response body as bytes
    """
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()


def parse_page_number(html):
    """Extract the text between the page marker and the next ']'.

    :param html: decoded page source
    :return: the page number as a string
    :raises ValueError: if the marker or the closing ']' is missing
    """
    marker = 'current-comment-page'
    pos = html.find(marker)
    if pos == -1:
        raise ValueError('page marker %r not found' % marker)
    # 23 = len(marker) + 3; presumably skips the '">[' that follows the
    # class name in the page markup.
    start = pos + 23
    end = html.find(']', start)
    if end == -1:
        raise ValueError('closing "]" of the page number not found')
    return html[start:end]


def get_page(url):
    """Return the current page number shown on *url*, as a string."""
    return parse_page_number(url_open(url).decode('utf-8'))


def get_time():
    """Return today's local date as a compact ``YYYYMMDD`` string."""
    return datetime.datetime.now().strftime('%Y%m%d')


# Kept from the original script; download_mm() computes its own value.
time_now = get_time()


def extract_img_addrs(html):
    """Collect the .jpg addresses found in ``img src=`` attributes.

    :param html: decoded page source
    :return: list of image URLs; protocol-relative ones get 'http:' prepended
    """
    tag = 'img src='
    img_addrs = []
    pos = html.find(tag)
    while pos != -1:
        start = pos + len(tag) + 1  # skip the opening quote
        limit = pos + 255
        quote = html[start - 1:start]
        if quote in ('"', "'"):
            closing = html.find(quote, start)
            if closing != -1:
                # Never search past this attribute, otherwise a non-jpg
                # image would swallow the next tag's '.jpg'.
                limit = min(limit, closing)
        ext = html.find('.jpg', start, limit)
        if ext != -1:
            address = html[start:ext + 4]
            if address.startswith('//'):
                address = 'http:' + address
            img_addrs.append(address)
            resume = ext
        else:
            resume = start
        pos = html.find(tag, resume)
    return img_addrs


def find_imgs(url):
    """Download *url* and return the image addresses it contains."""
    return extract_img_addrs(url_open(url).decode('utf-8'))


def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    :param folder: unused here; download_mm() has already chdir'd into it
    :param img_addrs: iterable of image URLs
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)


def build_page_url(base_url, date_str, page_num):
    """Build a listing URL whose last path part is base64('<date>-<page>')."""
    # The '-' separator was missing in the original concatenation.
    s_page = '{}-{}'.format(date_str, page_num)
    str_page = base64.b64encode(s_page.encode('utf-8')).decode('utf-8')
    return base_url + str_page + '#comments'


def page_numbers(latest, pages):
    """Return up to *pages* consecutive page numbers counting down from *latest*."""
    # The original `page_num -= i` subtracted cumulatively and skipped pages.
    return [latest - i for i in range(pages) if latest - i >= 1]


def download_mm(folder='ooxx', pages=10):
    """Download the images of the newest *pages* listing pages.

    :param folder: directory to store images in (created if missing)
    :param pages: number of listing pages to walk, newest first
    """
    os.makedirs(folder, exist_ok=True)  # allow re-running the script
    os.chdir(folder)
    url = 'https://jandan.net/ooxx/'
    latest = int(get_page(url))
    today = get_time()
    for page_num in page_numbers(latest, pages):
        page_url = build_page_url(url, today, page_num)
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()

# Article section heading: update of 2021-06-27 (works after this update).
近日看了些爬蟲,更新一下。
添加注釋,修改 bug。網站網址子目錄有所更改:ooxx => girl,變為 http://jandan.net/girl/。當前時間該目錄下共計 1855 張妹子圖片。enjoy yourself!
import base64
import datetime
import os
import urllib.request

# Browser-like User-Agent sent with every request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')


def url_open(url):
    """Open a URL and return its content.

    :param url: address of the page (or image) to fetch
    :return: raw response body as bytes
    """
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()


def parse_page_number(html):
    """Extract the text between the page marker and the next ']'.

    :param html: decoded page source
    :return: the page number as a string
    :raises ValueError: if the marker or the closing ']' is missing
    """
    marker = 'current-comment-page'
    pos = html.find(marker)
    if pos == -1:
        raise ValueError('page marker %r not found' % marker)
    # 23 = len(marker) + 3; presumably skips the '">[' that follows the
    # class name in the page markup.
    start = pos + 23
    end = html.find(']', start)
    if end == -1:
        raise ValueError('closing "]" of the page number not found')
    return html[start:end]


def get_page(url):
    """Get the newest page number of the site, used to walk all pages.

    :param url: address of the listing's front page
    :return: the newest page number, as a string
    """
    return parse_page_number(url_open(url).decode('utf-8'))


def get_time():
    """Get the current date in the format the page URLs use.

    :return: date string such as ``20210627``
    """
    return datetime.datetime.now().strftime('%Y%m%d')


def extract_img_addrs(html):
    """Collect the .jpg addresses found in ``img src=`` attributes.

    :param html: decoded page source
    :return: list of image URLs; protocol-relative ones get 'http:' prepended
    """
    tag = 'img src='
    img_addrs = []
    pos = html.find(tag)
    while pos != -1:
        start = pos + len(tag) + 1  # skip the opening quote
        limit = pos + 255
        quote = html[start - 1:start]
        if quote in ('"', "'"):
            closing = html.find(quote, start)
            if closing != -1:
                # Never search past this attribute, otherwise a non-jpg
                # image would swallow the next tag's '.jpg'.
                limit = min(limit, closing)
        ext = html.find('.jpg', start, limit)
        if ext != -1:
            address = html[start:ext + 4]
            if address.startswith('//'):
                address = 'http:' + address
            img_addrs.append(address)
            resume = ext
        else:
            resume = start
        pos = html.find(tag, resume)
    return img_addrs


def find_imgs(url):
    """Find the image addresses inside a page.

    :param url: page to search
    :return: list of image addresses found in that page
    """
    return extract_img_addrs(url_open(url).decode('utf-8'))


def save_imgs(folder, img_addrs):
    """Download and save images into the current directory.

    :param folder: name of the image folder; unused here because
        download_mm() has already chdir'd into it
    :param img_addrs: list of image addresses
    :return: None
    """
    print("此頁面共計" + str(len(img_addrs)) + "張妹子圖片...\n")
    for each in img_addrs:
        filename = each.split('/')[-1]
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
    print("此頁面的" + str(len(img_addrs)) + "張妹子圖片下載成功...\n")


def build_page_url(base_url, date_str, page_num):
    """Build a listing URL whose last path part is base64('<date>-<page>')."""
    s_page = '{}-{}'.format(date_str, page_num)
    str_page = base64.b64encode(s_page.encode('utf-8')).decode('utf-8')
    return base_url + str_page + '#comments'


def page_numbers(latest):
    """Return the page numbers 1..*latest*, inclusive."""
    # The original range(1, page_num) stopped one short of the newest page.
    return range(1, latest + 1)


def download_mm(folder='ooxx'):
    """Main routine: walk every listing page and download its images.

    :param folder: name of the folder to save images in
    :return: None
    """
    path = os.path.join(os.getcwd(), folder)
    if os.path.exists(path):
        print("文件夾已經存在...\n")
    else:
        os.mkdir(path)  # create the folder when it does not exist yet
    os.chdir(folder)
    url = 'https://jandan.net/girl/'  # site path changed as of 2021-06-27
    print("正在查找含有妹子的網頁...\n")
    page_num = int(get_page(url))  # highest page number of the listing
    print("查找成功,總共含有妹子的頁面數目為: " + str(page_num))
    time_now = get_time()
    num_all = 0  # running count of images handled so far
    for i in page_numbers(page_num):
        print("=====================================================")
        print("正在對頁面" + str(i) + "進行操作...\n")
        page_url = build_page_url(url, time_now, i)
        print("正在查找妹子圖片地址...\n")
        img_addrs = find_imgs(page_url)
        print("正在下載頁面" + str(i) + "內的妹子圖片...\n")
        num_all = num_all + len(img_addrs)
        save_imgs(folder, img_addrs)
        print("已經下載" + str(num_all) + "張圖片\n")


if __name__ == '__main__':
    # Change the folder name below to choose where images are stored;
    # the default is 'ooxx'.
    folder = 'ooxx'
    download_mm(folder)
    print("下載結束\n")

# Article note: the download finished with 1855 images in total, as shown
# in the figure of the original post.
下一步:
計劃更改為使用scrapy框架。
總結
以上是生活随笔為你收集整理的python爬虫爬美女图片:“小甲鱼056节论一只爬虫的自我修养4:OOXX” 最终更新2021.6.27日,更新后可用!!!的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: TED听后笔记:如何理解并克服拖延症
- 下一篇: 环形复杂度,McCabe方法是什么?