生活随笔
收集整理的這篇文章主要介紹了
python爬虫:做一个界面爬虫小软件
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
任務目標:
1.抓取不同類型的圖片
2.編寫一個GUI界面爬蟲程序,打包成exe可執行文件
3.遇到的難點
1.分析如何抓取不同類型的圖片
首先打開網站,可以看到有如下6個類型的菜單
點擊不同菜單,發現URL顯示如下
大胸妹:https://www.buxiuse.com/?cid=2
小翹臀:https://www.buxiuse.com/?cid=6
可以看到每個類型圖片對應不同的cid值
所以要想抓取不同類型的圖片,只需要構造下url
將cid進行參數化,然后傳給url即可
具體代碼在以下定義
2.利用tkinter進行GUI編程
之前有寫過一些tkinter編程的隨筆
例如 利用python制作一個翻譯工具
先來看一下這次設計的程序最終頁面布局,
然后再具體講下如何實現的,頁面布局如下:
1.選擇圖片存儲路徑
抓取到的圖片要保存到電腦本地,所以就想著最好能夠自己挑選本地任意一個文件夾作為存儲路徑
后來網上沖浪一番發現tkinter是可以實現這個功能的
可以通過tkinter.filedialog模塊中的askdirectory()方法實現
下面是在網上找到的一段示例代碼
'''
遇到問題沒人解答?小編創建了一個Python學習交流QQ群:857662006 尋找有志同道合的小伙伴,
互幫互助,群里還有不錯的視頻學習教程和PDF電子書!
'''
from tkinter import Button, Entry, Label, StringVar, Tk
from tkinter.filedialog import askdirectory


def select_path():
    """Ask the user for a directory and show it in the entry box.

    ``askdirectory()`` returns an empty string when the dialog is
    cancelled; in that case the previously chosen path is left untouched.
    """
    path_ = askdirectory()
    if path_:
        path.set(path_)


if __name__ == "__main__":
    root = Tk()
    # Module-level variable backing the entry; select_path() updates it.
    path = StringVar()

    Label(root, text="目標路徑:").grid(row=0, column=0)
    Entry(root, textvariable=path).grid(row=0, column=1)
    Button(root, text="路徑選擇", command=select_path).grid(row=0, column=2)

    root.mainloop()
效果如下
具體到這個例子,
(1)定義一個文本框,用來存放(顯示)選擇的存儲路徑
# Entry bound to self.path, so the chosen directory is displayed automatically.
self.input = tk.Entry(self.window, textvariable=self.path, width=80)
(2)定義一個按鈕,來觸發選擇本地路徑功能
# Button that opens the directory chooser via self.select_Path.
self.t_button = tk.Button(
    self.window,
    text='選擇路徑',
    relief=tk.RAISED,
    width=8,
    height=1,
    command=self.select_Path,
)
(3)定義一個函數,來實現選取路徑功能
'''
遇到問題沒人解答?小編創建了一個Python學習交流QQ群:857662006 尋找有志同道合的小伙伴,
互幫互助,群里還有不錯的視頻學習教程和PDF電子書!
'''
def select_Path(self):
    """Let the user pick a local directory and store it in ``self.path``.

    ``askdirectory()`` returns an empty string when the dialog is
    cancelled; the previously selected path is kept in that case.
    """
    path_ = askdirectory()
    if path_:
        self.path.set(path_)
后續保存圖片時,路徑可以直接使用前面定義好的self.input中的值
2.選擇分類
因為圖片分為了6個類別,每個類別對應一個cid值,所以可以預先把cid抽象出來,視為參數傳遞
(1)定義一個下拉框,存儲圖片類型
# 'values' is the documented ttk.Combobox option holding the drop-down entries.
self.menu['values'] = ('大胸妹', '小翹臀', '黑絲襪', '美腿控', '有顏值', '大雜燴')
(2)根據選擇的類型不同,返回不同的cid值
def get_cid(self):
    """Return the site's ``cid`` value for the category chosen in ``self.menu``.

    Returns:
        The integer ``cid`` of the selected category, or ``None`` when the
        combobox text is not one of the six known labels.
    """
    # Combobox label -> cid query value used in the listing URL.
    category = {
        '大胸妹': 2,
        '小翹臀': 6,
        '黑絲襪': 7,
        '美腿控': 3,
        '有顏值': 4,
        '大雜燴': 5,
    }
    return category.get(self.menu.get())
3.填寫爬取頁數
自定義抓取深度,例如抓取前5頁或者前10頁
# Small entry box in which the user types the number of pages to crawl.
self.page = tk.Entry(self.window, width=5)
后面把這個文本框的值傳給url即可
3.遇到的問題
下載圖片的名稱無效,導致無法保存
有些圖片沒有名稱,文件名就是.jpg,
這樣在保存時會提示非法字符無法保存,程序也會報錯,終止運行。
為了解決這個問題,我在每個文件名的末尾都加一個字母,這樣就不會存在無名稱圖片了
# t is a (name, image_url) pair; the extra 'q' keeps the file name non-empty
# when the image has no name.
file_path = save_path + '/' + t[0] + 'q' + '.jpg'
q就是我添加的字符,每個圖片都會有一個字母
整體效果如下
最后附上完整代碼:
'''
遇到問題沒人解答?小編創建了一個Python學習交流QQ群:857662006 尋找有志同道合的小伙伴,
互幫互助,群里還有不錯的視頻學習教程和PDF電子書!
'''
import os
import tkinter as tk
from tkinter import *
from tkinter import ttk
from tkinter.filedialog import askdirectory

import bs4
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
class DB:
    """Tkinter front end that downloads category images from buxiuse.com.

    The window holds a target-directory entry, a category combobox, a page
    count entry and a text box used as a progress log. ``download`` is bound
    to the crawl button and runs synchronously in the Tk callback.
    """

    # (combobox label, cid query value) in the order shown in the drop-down.
    _CATEGORIES = (
        ('大胸妹', 2),
        ('小翹臀', 6),
        ('黑絲襪', 7),
        ('美腿控', 3),
        ('有顏值', 4),
        ('大雜燴', 5),
    )
    # Characters that cannot appear in a Windows file name; '/' is also a
    # path separator everywhere, so it must never end up in a file name.
    _ILLEGAL_NAME_CHARS = '\\/:*?"<>|'
    # Seconds to wait for the server before a request is abandoned.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        """Create the main window and all widgets (layout is done separately)."""
        self.window = tk.Tk()
        self.window.title("Crawler Pics")
        self.menu = ttk.Combobox(self.window, width=6)
        self.path = tk.StringVar()
        self.lab1 = tk.Label(self.window, text="目標路徑:")
        self.lab2 = tk.Label(self.window, text="選擇分類:")
        self.lab3 = tk.Label(self.window, text="爬取頁數:")
        self.page = tk.Entry(self.window, width=5)
        self.input = tk.Entry(self.window, textvariable=self.path, width=80)
        self.info = tk.Text(self.window, height=20)
        self.menu['values'] = tuple(label for label, _ in self._CATEGORIES)
        self.menu.current(0)
        self.t_button = tk.Button(
            self.window, text='選擇路徑', relief=tk.RAISED,
            width=8, height=1, command=self.select_Path)
        self.t_button1 = tk.Button(
            self.window, text='爬取', relief=tk.RAISED,
            width=8, height=1, command=self.download)
        self.c_button2 = tk.Button(
            self.window, text='清空輸出', relief=tk.RAISED,
            width=8, height=1, command=self.cle)

    def gui_arrang(self):
        """Place every widget on the window grid."""
        self.lab1.grid(row=0, column=0)
        self.lab2.grid(row=1, column=0)
        self.menu.grid(row=1, column=1, sticky=tk.W)
        self.lab3.grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
        self.page.grid(row=2, column=1, sticky=tk.W)
        self.input.grid(row=0, column=1)
        self.info.grid(row=3, rowspan=5, column=0, columnspan=3,
                       padx=15, pady=15)
        self.t_button.grid(row=0, column=2, padx=5, pady=5, sticky=tk.W)
        self.t_button1.grid(row=1, column=2)
        self.c_button2.grid(row=0, column=3, padx=5, pady=5, sticky=tk.W)

    def get_cid(self):
        """Return the ``cid`` of the category currently shown in the combobox.

        Returns:
            The integer ``cid``, or ``None`` when the combobox text is not
            one of the known category labels.
        """
        return dict(self._CATEGORIES).get(self.menu.get())

    def select_Path(self):
        """Let the user pick a local directory and store it in ``self.path``.

        ``askdirectory()`` returns an empty string when the dialog is
        cancelled; the previously selected path is kept in that case.
        """
        path_ = askdirectory()
        if path_:
            self.path.set(path_)

    def get_html(self, url, header=None):
        """Request a listing page and return its body as text.

        Args:
            url: Address of the page to fetch.
            header: Optional dict of HTTP headers passed to ``requests.get``.

        Returns:
            The response text when the status code is 200; ``None`` for any
            other status or when the request raises ``RequestException``.
        """
        # The request itself must be inside the try block, otherwise
        # connection errors and timeouts escape the handler.
        try:
            response = requests.get(
                url, headers=header, timeout=self._REQUEST_TIMEOUT)
        except RequestException:
            print("請求失敗")
            return None
        if response.status_code == 200:
            return response.text
        return None

    def parse_html(self, html, list_data):
        """Collect image names and URLs from a listing page.

        Args:
            html: Page markup to parse.
            list_data: List that receives one ``[name, src]`` pair per image;
                it is extended in place.

        Returns:
            ``dict(list_data)``: image name (the ``alt`` text, ``''`` when
            missing) mapped to image URL. Images sharing a name therefore
            collapse to the last one, as in the original implementation.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all('img'):
            img_src = tag.get('src')
            if not img_src:
                # Nothing to download for an <img> without a source.
                continue
            name = tag.get('alt') or ''
            list_data.append([name, img_src])
        return dict(list_data)

    def get_image_content(self, url):
        """Download one image and return its raw bytes.

        The URL is printed and appended to the log box before the request.

        Returns:
            The response content when the status code is 200; ``None`` for
            any other status or when the request raises ``RequestException``.
        """
        print("正在下載", url)
        self.info.insert('end', "正在下載:" + url + '\n')
        try:
            r = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        except RequestException:
            return None
        if r.status_code == 200:
            return r.content
        return None

    @staticmethod
    def _image_file_name(name):
        """Build the on-disk file name for an image called ``name``.

        Characters that are illegal in file names are replaced by ``'_'``.
        The trailing ``'q'`` is kept from the original naming scheme so an
        image without a name still gets a non-empty file name (``q.jpg``).
        """
        illegal = DB._ILLEGAL_NAME_CHARS
        cleaned = ''.join(
            '_' if ch in illegal or ord(ch) < 32 else ch
            for ch in (name or '')
        )
        return cleaned + 'q' + '.jpg'

    def download(self):
        """Download the images of the selected category for pages 1..N.

        N is read from the page entry, the category from the combobox and
        the target directory from the path entry. Images are stored under
        ``<target>/pics/<category>/``; files that already exist are skipped.
        Invalid input is reported in the log box instead of raising inside
        the Tk callback. No custom request headers are sent.
        """
        try:
            page_count = int(self.page.get())
        except ValueError:
            self.info.insert('end', "請輸入有效的爬取頁數\n")
            return

        cid = self.get_cid()
        if cid is None:
            self.info.insert('end', "請選擇有效的圖片分類\n")
            return

        root_dir = self.input.get()
        if not root_dir:
            # An empty path would resolve to '/pics' at the filesystem root.
            self.info.insert('end', "請先選擇目標路徑\n")
            return

        pics_dir = root_dir + '/pics'
        try:
            # One sub-directory per category, as the original layout did.
            for label, _ in self._CATEGORIES:
                os.makedirs(pics_dir + '/' + label, exist_ok=True)
        except OSError:
            self.info.insert('end', "無法創建目錄:" + pics_dir + '\n')
            return
        save_path = pics_dir + '/' + self.menu.get()

        base_url = 'https://www.buxiuse.com/?'
        for page_no in range(1, page_count + 1):
            url = base_url + 'cid=' + str(cid) + '&' + 'page=' + str(page_no)
            html = self.get_html(url)
            if html is None:
                self.info.insert('end', "頁面請求失敗:" + url + '\n')
                continue

            for name, img_src in self.parse_html(html, []).items():
                file_path = save_path + '/' + self._image_file_name(name)
                if os.path.exists(file_path):
                    continue
                # Fetch before opening the file so a failed download does
                # not leave an empty file behind.
                content = self.get_image_content(img_src)
                if content is None:
                    continue
                try:
                    with open(file_path, 'wb') as f:
                        f.write(content)
                except OSError:
                    continue
                print('文件保存成功')

    def cle(self):
        """Clear the contents of the log text box."""
        self.info.delete('1.0', "end")


def main():
    """Build the window, lay it out and enter the Tk event loop."""
    t = DB()
    t.gui_arrang()
    tk.mainloop()


if __name__ == '__main__':
    main()
總結
以上是生活随笔為你收集整理的python爬虫:做一个界面爬虫小软件的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。