from urllib.parse import quote
import string
import re
from urllib import request
import urllib.request
# --- Version 1: download Baidu image-search thumbnails via urlretrieve ---

# Baidu "flip" (paged HTML) image-search endpoint; {word} is the
# percent-encoded search keyword.
FLIP_SEARCH_URL = (
    'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={word}'
    '&ct=201326592&v=flip'
)

# Header tuple installed on the global opener so requests look like a browser.
BROWSER_USER_AGENT = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36")

# re.S lets "." match newlines so the whole page is searched as one string.
THUMB_URL_PATTERN = re.compile('"thumbURL":"(.*?)",', re.S)

# Destination file template; the folder must already exist.
SAVE_PATH_TEMPLATE = 'D:\\deeplearn\\xuexicaogao\\圖片\\%s.jpg'

# The loop stops once the running index reaches this value, so at most
# MAX_IMAGE_INDEX - 1 URLs are attempted.
MAX_IMAGE_INDEX = 90


def build_flip_url(word):
    """Return the search URL for *word*.

    Only the keyword is percent-encoded (as UTF-8). Encoding just the keyword
    avoids the ``UnicodeEncodeError: 'ascii' codec can't encode characters``
    raised when a non-ASCII URL is requested, and also escapes spaces, ``&``
    and ``#`` that would otherwise corrupt the query string.
    """
    return FLIP_SEARCH_URL.format(word=quote(word, safe=""))


def find_thumb_urls(html):
    """Return every ``thumbURL`` value found in the page source *html*."""
    return THUMB_URL_PATTERN.findall(html)


def crawl_baidu_thumbnails():
    """Prompt for a keyword, fetch the result page and save its thumbnails.

    Files are written as ``1.jpg``, ``2.jpg``, ... using SAVE_PATH_TEMPLATE.
    A failed download is reported and skipped; its index is still consumed.
    """
    word = input('關鍵詞:')
    url = build_flip_url(word)

    # Install the opener globally so that urlretrieve() below also sends the
    # browser User-Agent header.
    opener = urllib.request.build_opener()
    opener.addheaders = [BROWSER_USER_AGENT]
    urllib.request.install_opener(opener)

    url_request = request.Request(url)
    # timeout=10: a request taking longer than 10 seconds raises an exception
    # instead of blocking forever (it is not retried automatically).
    with request.urlopen(url_request, timeout=10) as url_response:
        # read() returns bytes; decode to str before running the regex.
        html = url_response.read().decode('utf-8')

    jpglist = find_thumb_urls(html)
    print(len(jpglist))

    n = 1
    for each in jpglist:
        print(each)
        try:
            request.urlretrieve(each, SAVE_PATH_TEMPLATE % n)
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next
            # image rather than aborting the whole run.
            print(e)
        else:
            # Only announce completion when the download actually succeeded.
            print('下載完成。')
        n += 1
        if n == MAX_IMAGE_INDEX:
            break
    print('結束')


if __name__ == "__main__":
    crawl_baidu_thumbnails()
代碼解析:爬蟲報錯 UnicodeEncodeError: 'ascii' codec can't encode characters in position 45-47: ordinal not in range(128)。原因:HTTP 請求中的 URL 只能包含 ascii 字符(Python 2 的默認編碼也是 ascii),當程序中出現非 ascii 字符(例如中文關鍵詞)時,python 的處理常常會報這樣的錯:UnicodeDecodeError: 'ascii' codec can't decode byte 0x?? in position 1: ordinal not in range(128)。python 沒辦法直接處理這些非 ascii 字符,此時需要自己指定編碼,一般設置為 utf8 的編碼格式。解決辦法:使用 urllib.parse.quote 將非 ascii 字符按 utf8 進行百分號編碼轉換。
結果文件夾
代碼版本2 語言python
import urllib
import urllib.request
from urllib.parse import quote
import re
import os

# --- Version 2: download Baidu image-search thumbnails into a per-keyword folder ---

# Browser-like headers; the referer is sent with every request, including
# the image downloads.
REQUEST_HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36","referer":"https://image.baidu.com"}

# Parent folder under which one sub-folder per keyword is created.
DOWNLOAD_ROOT = "C://Users//Shineion//Desktop//爬蟲圖"

# Compiled once; captures the value of each thumbURL field in the page.
THUMB_URL_RE = re.compile(r'thumbURL":"(.+?)"')


def make_search_url(keyword):
    """Return the Baidu image-search URL with *keyword* percent-encoded as UTF-8."""
    keyword1 = quote(keyword, encoding="utf-8")
    return ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
            + keyword1 + '&ct=201326592&v=flip')


def parse_thumb_urls(page):
    """Return every thumbnail URL found in the page source *page*."""
    return THUMB_URL_RE.findall(page)


def ensure_download_dir(target_dir):
    """Create *target_dir* (and any missing parents) if it does not exist.

    Prints the same status messages as before and returns True when the
    folder was created, False when it was already present.
    """
    if os.path.isdir(target_dir):
        print("文件夾已經存在")
        return False
    # makedirs replaces the duplicated "create parent, then create child"
    # branches with a single call.
    os.makedirs(target_dir, exist_ok=True)
    print(target_dir + "已經創建成功")
    return True


def download_keyword_images():
    """Prompt for a keyword and save every thumbnail of its result page.

    Images are written to ``<DOWNLOAD_ROOT>//<keyword>/<keyword><n>.jpg``
    with ``n`` counting successful downloads from 0. A failed download is
    reported and skipped so one bad URL does not abort the run.
    """
    print("****************************************************************************************")
    keyword = input("請輸入要下載的圖片:")
    target_dir = DOWNLOAD_ROOT + "//" + keyword
    ensure_download_dir(target_dir)

    req = urllib.request.Request(make_search_url(keyword), headers=REQUEST_HEADERS)
    with urllib.request.urlopen(req, timeout=10) as response:
        page = response.read().decode("utf-8")

    num = 0
    for image_url in parse_thumb_urls(page):
        print("正在下載" + image_url)
        image_req = urllib.request.Request(image_url, headers=REQUEST_HEADERS)
        try:
            with urllib.request.urlopen(image_req, timeout=10) as response:
                image_bytes = response.read()
        except Exception as err:
            # Best-effort crawl: report and continue with the next image.
            print(err)
            continue
        with open(target_dir + "/" + keyword + str(num) + ".jpg", "wb") as fs:
            fs.write(image_bytes)
        num += 1
        print(image_url + "已下載成功")

    # Keep the console window open until the user presses Enter.
    input("按任意鍵結束程序:")


if __name__ == "__main__":
    download_keyword_images()