本文介绍如何用 Python 爬取网上车市（http://www.cheshi.com/）的数据，并整理成完整脚本分享给大家，供参考。
#coding:utf8
# Crawl data from the cheshi.com car-market site [http://www.cheshi.com/]
import requests, json, time, re, os, sys, time,urllib2,shutil,string
import threading
import MySQLdb
import redis
from pyquery import PyQuery as pq
from urlparse import urljoin
from selenium import webdriver# force utf-8 as the process default encoding (Python 2 only idiom)
reload(sys)
sys.setdefaultencoding( "utf-8" )# helper below reads a whole file's lines
def getLines(filename):
    """Return every line of *filename* as a list (binary mode, newline kept).

    BUG FIX: the original never closed the file handle; use ``with`` so the
    descriptor is released even if ``readlines`` raises.
    """
    with open(filename, 'rb') as file_object:
        return file_object.readlines()
def get_url_type_id(v_url_name):
    """Map a car-series name to its url_type_id via brand.ini.

    Each line of brand.ini is "url_type_id,url_cate,url_name"
    (e.g. "3000,品牌,奧迪" / "4000,奧迪,奧迪A6").  Returns '' when the name
    is not found.  (The original had an unreachable ``break`` after
    ``return`` and crashed with IndexError on malformed lines.)
    """
    target = v_url_name.strip()
    for line in getLines('/home/shutong/crawl/car/script/brand.ini'):
        fields = line.strip().split(',')
        if len(fields) < 3:
            # skip malformed lines instead of raising IndexError
            continue
        if target == fields[2].strip():
            return fields[0]
    return ''


class ResultData(object):
    """Value object for one crawled URL record (one output row)."""

    def __init__(self, industry_id, url_type_id, url_name, url_value,
                 web_type_id, web_name, date_id):
        self.industry_id = industry_id    # e.g. '004004' (automobile)
        self.url_type_id = url_type_id    # '3000' brand / '4000' series / '' unknown
        self.url_name = url_name
        self.url_value = url_value
        self.web_type_id = web_type_id    # '0' = PC site
        self.web_name = web_name
        self.date_id = date_id            # crawl date partition, e.g. '20180227'

    def __str__(self):
        # BUG FIX: original returned a tuple (TypeError for __str__) and
        # referenced self.self.web_name plus a non-existent class attribute
        # ResultData.date_id.  Render the row in the same '^' format used
        # by Base._saveContext instead.
        return '^'.join(str(v) for v in (
            self.industry_id, self.url_type_id, self.url_name,
            self.url_value, self.web_type_id, self.web_name, self.date_id))


class Base(object):
    """Persistence helper: writes '^'-separated records to a file in 'prd'
    mode, prints them in dev mode, and pushes unresolvable records to redis."""

    def __init__(self, dev_prd_flag):
        # 'prd' -> append to the output file; anything else -> print only
        self.dev_prd_flag = dev_prd_flag

    def _saveContext(self, filename, *name):
        """Join *name* fields with '^' and print (dev) or append (prd)."""
        sep = '^'  # renamed from 'format', which shadowed the builtin
        context = name[0]
        for item in name[1:]:
            context = context + sep + str(item)
        # normalise full-width CJK punctuation to ASCII equivalents
        context = str(context).replace('（', '(').replace('）', ')').replace('，', ',').replace('：', ':')
        if self.dev_prd_flag != 'prd':
            print(context)
        else:
            filename = filename.strip()
            path = os.path.dirname(filename)
            # create the target directory on first use
            if not os.path.exists(path):
                os.makedirs(path)
            # append one record per line; close the handle even on error
            fp = open(filename, 'a')
            try:
                fp.write(context + '\n')
            finally:
                fp.close()

    def saveData(self, filename, result_data):
        """Persist one ResultData: to *filename* when it has a url_type_id,
        otherwise into the redis 'errorList' set for later resolution.
        (Parameter renamed from 'ResultData', which shadowed the class.)"""
        if result_data.url_type_id:
            self._saveContext(filename,
                              result_data.industry_id, result_data.url_type_id,
                              result_data.url_name, result_data.url_value,
                              result_data.web_type_id, result_data.web_name,
                              result_data.date_id)
        else:
            r = redis.Redis(host='192.168.122.140', port=6379, db=0)
            r.sadd('errorList',
                   result_data.industry_id + '^' + result_data.url_name + '^' + result_data.url_value)

    def __str__(self):
        return '保存文件的基類'


class Crawl(Base):
    """Fetches pages via PhantomJS (after init_driver()) or plain urllib2."""

    driver = None  # shared default; instance attr set by init_driver()

    def __init__(self, name, dev_prd_flag):
        # NOTE(review): the original hard-coded dev_prd_flag='dev' in the
        # super() call, then overwrote it below; behaviour preserved.
        super(Crawl, self).__init__(dev_prd_flag='dev')
        self.dev_prd_flag = dev_prd_flag
        self.name = name

    def init_driver(self):
        """Start a headless PhantomJS browser and bind it to self.driver."""
        ua = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.3 Safari/537.36"
        cap = webdriver.DesiredCapabilities.PHANTOMJS
        cap["phantomjs.page.settings.resourceTimeout"] = 20000
        cap["phantomjs.page.settings.loadImages"] = True
        cap["phantomjs.page.settings.disk-cache"] = True
        cap["phantomjs.page.settings.userAgent"] = ua
        cap["phantomjs.page.customHeaders.User-Agent"] = ua
        cap["phantomjs.page.customHeaders.Referer"] = "http://tj.ac.10086.cn/login/"
        driver = webdriver.PhantomJS(
            executable_path='/home/shutong/phantomjs/bin/phantomjs',
            desired_capabilities=cap,
            service_args=['--ignore-ssl-errors=true'])
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        self.driver = driver

    def getHtml(self, url, code='utf-8'):
        """Return the page source of *url* decoded with *code*, or '' on
        any error (deliberate best-effort fetch)."""
        html = ''
        try:
            if self.driver:
                self.driver.get(url)
                html = self.driver.page_source
            else:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
                request = urllib2.Request(url, headers=headers)
                response = urllib2.urlopen(request, data=None, timeout=60)
                if code:
                    html = unicode(response.read(), str(code))
        except Exception:
            # swallow network/decoding errors; caller gets ''
            pass
        return html

    def __del__(self):
        if self.driver:
            self.driver.quit()
            print("瀏覽器成功關閉")
        else:
            print("瀏覽器未打開使用")

    def __str__(self):
        return "爬蟲基礎類"


def start_crawl(url):
    """Collect xcar model-page links (…/m<digits>/) from *url* into the
    redis set 'urllist'.  (Legacy helper; not called by the entry point.)"""
    # BUG FIX: the original referenced an undefined global 'crawl';
    # build a dev-mode crawler locally instead.
    crawl = Crawl('網上車市', 'dev')
    r = redis.Redis(host='192.168.122.140', port=6379, db=0)
    urllist = []
    html = crawl.getHtml(url, 'gbk')
    d = pq(html)
    for a in d('a'):
        a = pq(a)
        try:
            url_value = urljoin(url, a.attr('href'))
            # e.g. http://newcar.xcar.com.cn/162/ — series pages were
            # collected in an earlier version; deliberately skipped now
            if re.match(r'http://newcar.xcar.com.cn/[0-9]{1,10}/$', url_value, re.M | re.I):
                pass
            elif re.match(r'http://newcar.xcar.com.cn/m[0-9]{1,10}/$', url_value, re.M | re.I):
                r.sadd('urllist', url_value)
        except Exception:
            pass
    # de-duplicate before printing; recursive descent is disabled
    for index in list(set(urllist)):
        print(index)


def start_wscs_crawl(url):
    """Crawl one cheshi.com select-car page and save brand / sub-brand /
    series records.  Uses module-level globals ``date_id`` and ``filename``."""
    flag = 'prd'            # 'dev' prints records instead of writing the file
    industry_id = '004004'  # automobile industry id
    web_type_id = '0'       # 0 = PC site
    web_name = '網上車市'
    crawl = Crawl('網上車市', flag)
    html = crawl.getHtml(url)
    d = pq(html)
    for div in d('div').filter('.list-box'):
        div = pq(div)
        # left column: top-level brand name + link (url_type_id 3000)
        brand = div('div').filter('.lb').find('span').text()
        brand_url = urljoin(url, div('div').filter('.lb')('a').attr('href'))
        crawl.saveData(filename, ResultData(industry_id, '3000', brand, brand_url,
                                            web_type_id, web_name, date_id))
        # right column header: sub-brand name + link (also 3000)
        brand = div('div').filter('.rb')('dl')('dt')('a').text().replace('>>', '')
        brand_url = urljoin(url, div('div').filter('.rb')('dl')('dt')('a').attr('href'))
        crawl.saveData(filename, ResultData(industry_id, '3000', brand, brand_url,
                                            web_type_id, web_name, date_id))
        # right column entries: individual car series; type id looked up
        # from brand.ini ('' routes the record to the redis errorList)
        for dd in div('div').filter('.rb')('dl')('dd'):
            dd = pq(dd)
            car_name = dd('div').filter('.con')('h4').text()
            car_url = urljoin(url, dd('div').filter('.con')('h4')('a').attr('href'))
            crawl.saveData(filename, ResultData(industry_id, get_url_type_id(car_name),
                                                car_name, car_url,
                                                web_type_id, web_name, date_id))
def start_mutli_crawl():
    """Crawl all 26 A-Z select-car index pages of cheshi.com, one thread per
    letter, and wait for them all to finish.

    Fixes: the local variable 'list' shadowed the builtin; index-based
    thread loops replaced with direct iteration; 'string.uppercase'
    (locale-dependent, Python-2-only) replaced with 'string.ascii_uppercase',
    which matches the intended A-Z URLs.  The function name's 'mutli' typo is
    kept because module-level code calls it by this name.
    """
    urls = ['http://product.cheshi.com/static/selectcar/%s.html?t=1519713137030' % letter
            for letter in string.ascii_uppercase]
    # one worker thread per index page
    threads = [threading.Thread(target=start_wscs_crawl, args=(u,)) for u in urls]
    for t in threads:
        t.start()
    # block until every page has been processed
    for t in threads:
        t.join()
# Entry point --------------------------------------------------------------
# Usage: python wscs.py <date_id> <output_filename>
#   e.g. python wscs.py 20180227 /home/shutong/crawl/car/script/wscs.csv
if len(sys.argv) < 3:
    # BUG FIX: fail with a usage message instead of an IndexError
    sys.stderr.write('usage: %s <date_id> <filename>\n' % sys.argv[0])
    sys.exit(1)
date_id = sys.argv[1]    # crawl date partition, read by start_wscs_crawl
filename = sys.argv[2]   # output file path, read by start_wscs_crawl
# launch the multi-threaded crawl over all A-Z index pages
start_mutli_crawl()
轉載于：https://www.cnblogs.com/Jims2016/p/8554928.html
總結
以上是《Python爬取网上车市[http://www.cheshi.com/]的数据》的全部內容，希望這份腳本能幫你解決所遇到的問題。
- 上一篇: Java分布式中文分词组件 - word
- 下一篇: 圆心科技再冲刺港交所上市:收入和亏损同增