Scraping truck images from a used-truck listing site
# -*- coding: utf-8 -*-
# Author      : ZhangRong z00520111
# Created     : 2020/3/28 10:09
# File        : catchhuoche.py
# IDE         : PyCharm
# Description : Crawl used-truck listings and images from hcj198.com
# Copyright @ Huawei Technologies Co., Ltd. 2019-2020. All rights reserved.
import re
import requests
from pyquery import PyQuery as pq
from getcookie import excuteScript  # author's local helper, used by the commented-out antipas code below
import time
import random
import json
import os

# Silence the InsecureRequestWarning raised by the verify=False requests below.
requests.packages.urllib3.disable_warnings()

# Handy for re-decoding a mis-decoded response:
# str(content).encode('ISO-8859-1').decode('utf-8')
# Brand keywords matched against the (Chinese) vehicle title on each detail
# page, so the values must stay in Chinese: Dongfeng, FAW, JAC, Sanhuan, JMC,
# Sinotruk, Foton, Shacman, SAIC, Kama, Changan.
carbrandlist = ['東風', '一汽', '江淮', '三環', '江鈴', '重汽', '福田', '陜汽', '上汽', '凱馬', '長安']
image_num = 0  # running image counter (note: never reset between vehicles)
car_num = 0    # index of the current vehicle
prepath = 'E:/pictures/'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
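
# --- Optional sketch (not in the original post): a GET with timeout and retries.
# The script below calls sess.get()/requests.get() directly and with no timeout;
# a wrapper like this (the name get_with_retry is mine) would make long crawls
# more resilient. It is defined here as an illustration but not wired into the class.
def get_with_retry(sess, url, retries=3, **kwargs):
    for attempt in range(retries):
        try:
            r = sess.get(url, timeout=30, **kwargs)
            r.raise_for_status()  # treat HTTP errors as failures worth retrying
            return r
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            # the same randomized pause the main loop uses between vehicles
            time.sleep(random.randint(5, 15))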

class HuoCheCrawler():
    def __init__(self):
        proxy_list = [
            # proxy settings: add proxy URLs here, e.g. 'http://host:port'
        ]
        # random.choice() raises IndexError on an empty list, so only build the
        # proxy mapping when proxies are actually configured.
        proxies = {
            "http": random.choice(proxy_list),
            "https": random.choice(proxy_list),
        } if proxy_list else {}
        self.baseurl = 'https://www.hcj198.com'
        self.sess = requests.Session()
        self.sess.headers = headers
        self.sess.proxies = proxies
        self.start_url = 'https://www.hcj198.com/car.html'

    # def anti_value(self):
    #     '''
    #     Get the key and value needed for the antipas cookie.
    #     :return:
    #     '''
    #     content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
    #     params = re.findall(r"value=anti\('(.*?)','(.*?)'\)", content)[0]
    #     return params

    # def caculate_antipas(self):
    #     '''
    #     Compute the antipas cookie via the getcookie helper.
    #     :return:
    #     '''
    #     params = self.anti_value()
    #     antipas = excuteScript(params[0], params[1])
    #     self.sess.cookies.set('antipas', antipas)

    def page_url(self):
        # self.caculate_antipas()
        '''
        Collect the pagination links.
        :return: list of listing-page URLs
        '''
        content = pq(self.sess.get(self.start_url, verify=False).text)
        # If these XPath-style [@class="..."] selectors fail with your
        # pyquery/cssselect version, try plain CSS ones like [class="..."].
        page_num_max = max([int(each.text()) for each in
                            content('div[@class="page-center search_list_one"] ul[@class="pagination"] > li > a').items()
                            if re.match(r'\d+', each.text())])
        page_url_list = []
        for i in range(1, page_num_max + 1):
            base_url = 'https://www.hcj198.com/car.html?page={}'.format(i)
            page_url_list.append(base_url)
        return page_url_list

    def index_page(self, start_url):
        '''
        Yield the detail-page links found on one listing page.
        :param start_url: listing-page URL
        :return:
        '''
        content = pq(self.sess.get(start_url).text)
        for each in content('ul[@class="car-ul"] > li > a').items():
            url = each.attr.href
            if not url.startswith('http'):
                url = self.baseurl + url
            yield url

    def detail_page(self, detail_url):
        '''
        Scrape one vehicle's detail page.
        :param detail_url: detail-page URL
        :return: (data_dict, image_url_list)
        '''
        content = pq(self.sess.get(detail_url).text, parser="html")
        detail = content('ul[@class="tages-param"] li div').text()
        eachDetail = detail.split(' ')
        # The gallery image URLs sit inside url("...") declarations in the
        # markup, so pull them out with a regex. (An earlier variant, now
        # removed, pulled data-src attributes off <img> tags instead.)
        tem = content('div[@class="tages-img-list"]')
        tem1 = str(tem('div'))
        pattern = r'url\("(.*?)"'
        result = re.findall(pattern, tem1)
        name = content('div[@class="pro-title-cmodel"]').text().strip()
        # Fall back to the last brand in the list when nothing matches (the
        # original loop had the same net effect).
        carbrand = carbrandlist[-1]
        for brand in carbrandlist:
            if brand in name:
                carbrand = brand
                break
        price = content('div[@class="detail-left-dprice"] div[@class="dprice-left"]').text()
        data_dict = {
            'name': name,
            'carbrand': carbrand,
            'bordingdate': eachDetail[0],
            'km': eachDetail[3],
            'displacement': eachDetail[4],
            'carstyle': eachDetail[1],
            'price': price[price.index('¥') + 1:],
            'image': result
        }
        if not data_dict['name']:
            # An empty name usually means a mis-decoded page; dump it re-decoded.
            print(str(content).encode('ISO-8859-1').decode('utf-8'))
        return data_dict, result

    def request_download(self, https, carbrand):
        global car_num, image_num
        proxy_list = [
            # proxy settings (same as in __init__)
        ]
        proxies = {
            "http": random.choice(proxy_list),
            "https": random.choice(proxy_list),
        } if proxy_list else {}
        r = requests.get(https, proxies=proxies, verify=False)
        # run() increments car_num before downloading, hence car_num - 1 here.
        with open(prepath + carbrand + '/' + carbrand + str(car_num - 1) + '/'
                  + carbrand + str(car_num - 1) + '_' + str(image_num) + '.png', 'wb') as f:
            f.write(r.content)
        image_num = image_num + 1

    def run(self):
        global car_num
        for pageurl in self.page_url():
            for detail_url in self.index_page(pageurl):
                listout, result = self.detail_page(detail_url)
                data_string = json.dumps(listout, ensure_ascii=False)
                carbrand = listout['carbrand']
                filename = carbrand + str(car_num)
                # Create the per-vehicle directory if it does not exist yet.
                isExists = os.path.exists(prepath + carbrand + '/' + filename + '/')
                if not isExists:
                    os.makedirs(prepath + carbrand + '/' + filename + '/')
                with open(prepath + carbrand + '/' + filename + '/' + filename + ".txt",
                          "a+", encoding='utf-8') as file:
                    file.write(data_string)
                car_num = car_num + 1
                print("list is ", listout)
                # Download at most 7 images per vehicle.
                stop = 0
                for https in result:
                    if stop == 7:
                        break
                    self.request_download(self.baseurl + https, carbrand)
                    stop = stop + 1
                print("Sleeping 5-15 s to avoid getting banned")
                time.sleep(random.randint(5, 15))
                print('*' * 200)


if __name__ == '__main__':
    hccrawler = HuoCheCrawler()
    hccrawler.run()
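
Before launching the full crawl, it is worth checking the selectors against a single page. Below is a minimal smoke test, assuming the site layout still matches the selectors above; it uses only methods defined in the script.

```python
crawler = HuoCheCrawler()
first_page = crawler.page_url()[0]                 # first listing page
detail_url = next(crawler.index_page(first_page))  # first detail link on it
data, images = crawler.detail_page(detail_url)
print(data['name'], data['price'], len(images), 'image URLs')
```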
Summary

The crawler walks every pagination link under /car.html, follows each vehicle's detail page, writes the parsed fields (name, brand, registration date, mileage, displacement, style, price, image URLs) as JSON into a per-vehicle folder under E:/pictures/, downloads at most 7 images per vehicle, and sleeps 5-15 seconds between vehicles to avoid getting blocked. The weakest spot is the hand-concatenated Windows paths; a sketch of the same directory layout built with os.path.join follows below.
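
This is a minimal sketch rather than part of the original script; the helper name image_path is mine.

```python
import os

def image_path(prepath, carbrand, car_idx, image_idx):
    """Build <prepath>/<brand>/<brand><idx>/<brand><idx>_<image>.png safely."""
    folder = os.path.join(prepath, carbrand, carbrand + str(car_idx))
    # exist_ok=True folds the os.path.exists()/os.makedirs() dance in run() into one call
    os.makedirs(folder, exist_ok=True)
    return os.path.join(folder, '{}{}_{}.png'.format(carbrand, car_idx, image_idx))
```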