Python 携程爬虫开发笔记
前言
最近購(gòu)買了《Python3 爬蟲、數(shù)據(jù)清洗與可視化實(shí)戰(zhàn)》,剛好適逢暑假,就嘗試從攜程頁(yè)面對(duì)廣州的周邊游產(chǎn)品進(jìn)行爬蟲數(shù)據(jù)捕捉。
因?yàn)椴艑W(xué)Python不夠一個(gè)星期,python的命名規(guī)范還是不太了解,只能套用之前iOS開發(fā)的命名規(guī)范,有不足之處請(qǐng)多多指點(diǎn)
一、前期
1.主要用到的庫(kù)
from bs4 import BeautifulSoup
import time
import re #正則表達(dá)式
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains #瀏覽器操作
import xlrd
import xlwt
from xlutils.copy import copy
import os
BeautifulSoup:用于對(duì)標(biāo)簽等數(shù)據(jù)進(jìn)行定位和抓取
selenium:用于啟動(dòng)瀏覽器和對(duì)頁(yè)面進(jìn)行自動(dòng)操作
time:暫停等待操作
xlrd、xlwt、xlutils:對(duì)數(shù)據(jù)結(jié)果進(jìn)行Excel讀寫保存操作
2.核心思路
1,跳進(jìn)出發(fā)點(diǎn)的周邊游頁(yè)面(廣州)
2,在首頁(yè)捕捉推薦的熱門目的地和熱點(diǎn)景點(diǎn),進(jìn)行保存
3,針對(duì)目的地地點(diǎn)進(jìn)行遍歷搜索所展示的旅游產(chǎn)品
4,產(chǎn)品數(shù)據(jù)參數(shù)抓取
5,數(shù)據(jù)保存
6,退出瀏覽器
二、代碼
1.啟動(dòng)瀏覽器
def setupDriverSetting():
    """Launch a Chrome browser via Selenium and open Ctrip's "around trips" home page.

    Side effect: binds the module-level ``driver`` that every other
    function in this script uses.
    """
    global driver
    # Desktop site listing weekend/around-city trip products.
    url = 'https://weekend.ctrip.com/around/'
    # Chrome is used here; a Firefox profile with a custom User-Agent
    # would also work if Chrome is unavailable.
    driver = webdriver.Chrome()
    driver.get(url)
用webdriver啟動(dòng)Chrome或者fireFox,并跳進(jìn)首頁(yè)URL
2.選擇出發(fā)點(diǎn)城市
def select_StartPlace(startPlace):
    """Pick the departure city and navigate to its dedicated page.

    startPlace: exact visible city name to match (e.g. '广州').
    """
    # Open the departure-city dropdown.
    driver.find_element_by_xpath("//*[@id='CitySelect']").click()
    # Each <ul> under the selector holds a group of candidate city links.
    cityList = driver.find_elements_by_xpath("//*[@id='CitySelect']/dd/ul")
    for group in cityList:
        for eachCity in group.find_elements(By.TAG_NAME, "a"):
            if eachCity.text == startPlace:
                print("找到目標(biāo)城市:" + eachCity.get_attribute('href'))
                driver.get(eachCity.get_attribute('href'))
                time.sleep(2)
                try:
                    WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
                except Exception:
                    # Bug fix: was a bare ``except:`` which also swallows
                    # KeyboardInterrupt/SystemExit.
                    print('出發(fā)地頁(yè)面加載不成功')
                # Bug fix: the original ``break`` only exited the inner
                # loop; once the city is handled there is nothing left to
                # do, so stop scanning entirely.
                return
主要是用find_element_by_xpath尋找目標(biāo)城市進(jìn)行選擇篩選,然后跳到城市專頁(yè)
3.搜索目的地
def finAllDestinationPage():
    """Scrape the recommended-destination sidebar on the city page.

    Returns a dict mapping category name (e.g. hot destinations / hot
    POIs) to the list of destination names under it.
    """
    # 'J_sub_circum' scopes results to around-trip products
    # (as opposed to outbound travel).
    destType = driver.find_element_by_id("J_sub_circum")
    print(destType.text)
    destType1 = destType.find_element_by_class_name("side_jmp_dest")
    item = BeautifulSoup(destType1.get_attribute('innerHTML'), 'lxml')
    allDestinationListDic = {}
    for each in item.find_all('li'):
        typeName = each.h4.string
        # Bug fix: the original bound the accumulator to ``list``,
        # shadowing the builtin; a comprehension avoids the name entirely.
        allDestinationListDic[typeName] = [a.string for a in each.find_all('a')]
    return allDestinationListDic
搜索所有可推薦目的地和景點(diǎn),并用字典保存
4.旅游產(chǎn)品列表頁(yè)
def jump_destinationPage(startPlace, destination):
    """Search *destination* from the city page and walk every result page.

    startPlace: departure city, re-selected after the search to guard
        against the site resetting it.
    destination: keyword typed into the search box.
    Returns the module-level driver so callers can keep using the session.
    """
    # Wait for the search box of the city page.
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
    except Exception:
        # Bug fix: bare ``except:`` narrowed.
        print('查找不到搜索欄')
    finally:
        print('本地頁(yè)面加載完畢')
    driver.find_element_by_xpath("//input[@id='SearchText']").send_keys(destination)
    print("輸入目的地:" + destination)
    driver.find_element_by_xpath("//*[@id='SearchBtn']").click()
    print("點(diǎn)擊搜索按鈕結(jié)束")
    time.sleep(2)
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='js-dpSearcher']")))
    except Exception:
        print('產(chǎn)品列表頁(yè)加載不成功')
    finally:
        print('產(chǎn)品列表頁(yè)加載完畢')
    # Re-select the departure city: the search can reset it.
    reSelect_StartPlace(startPlace)
    # The pager text ends with '頁(yè)'; the second number in it is the
    # total page count (e.g. "共 1/12頁(yè)" -> 12).
    pageHtml = driver.find_element_by_xpath("//*[@id='_sort']/div/span")
    print(pageHtml.text)
    pageNumStr = pageHtml.text[:-1]  # drop the trailing '頁(yè)' character
    print("獲取的num:" + pageNumStr)
    pageNumS = re.findall(r'\d+', pageNumStr)
    pageNum = int(pageNumS[1])
    print(pageNum)
    tourProductList = []
    for i in range(pageNum):
        itemList = showCurrentPageAllData()
        # Collect every product card on the current page.
        for j in range(len(itemList)):
            tourProductList.append(collectCurrentPageEachData(j))
        # Bug fix: the original clicked "next page" on every iteration,
        # including the last one, trying to navigate past the final page.
        if i < pageNum - 1:
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").clear()
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").send_keys(str(i + 2))
            driver.find_element_by_xpath("//*[@id='ipt_page_btn']").click()
            print("點(diǎn)擊下一頁(yè)結(jié)束->" + str(i + 2) + "頁(yè)")
            time.sleep(2)
    return driver
跳進(jìn)產(chǎn)品頁(yè),并根據(jù)標(biāo)簽,抓取總頁(yè)數(shù),在遍歷所有旅游產(chǎn)品后,再跳到下一頁(yè)進(jìn)行循環(huán)遍歷
5.產(chǎn)品數(shù)據(jù)抓取
def collectCurrentPageEachData(itemNum):
    """Parse the *itemNum*-th product card on the current result page.

    Returns a dict with the product's name, link, type, price, vendor,
    grade and head-count (grade/head-count default to '' when absent).
    """
    itemList = driver.find_elements_by_class_name("product_box")
    # Bug fix: the original named this variable ``str``, shadowing the builtin.
    itemHtml = itemList[itemNum].get_attribute('innerHTML')
    item = BeautifulSoup(itemHtml, "lxml")  # soup for this single card
    # --- product name ---
    titleNameHtml = item.find('h2', class_='product_title')
    print("-------" + titleNameHtml.get_text())
    productName = titleNameHtml.get_text()
    # --- product link: href starts with '//', force https ---
    productLink = "https://" + titleNameHtml.a['href'][2:]
    print("link:" + productLink)
    # --- product type ---
    productType = item.find('em')
    print("type:" + productType.get_text())
    productTypeStr = productType.get_text()
    # --- price: normalize integer prices to two decimals ---
    priceStr = item.find('span', class_='sr_price').strong.get_text()
    if priceStr.isdigit():
        priceStr = "%.2f" % float(priceStr)
    print("price:" + priceStr)
    # --- vendor: title attribute may carry a 4-char prefix to strip ---
    productRetailStr = item.find('p', class_='product_retail')['title']
    if "供應(yīng)商" in productRetailStr:
        productRetailStr = productRetailStr[4:]
    print("retail:" + productRetailStr)
    # --- grade: optional on some cards ---
    try:
        gradeStr = item.find('p', class_='grade').strong.get_text()
        print("grade:" + gradeStr)
    except AttributeError:
        # Bug fix: bare ``except:`` narrowed — a missing node makes
        # ``find`` return None, raising AttributeError.
        print('查找不到評(píng)分')
        gradeStr = ''
    # --- traveller count: optional on some cards ---
    try:
        commentStr = item.find('div', class_='comment').em.get_text()
        commentNum = int(re.findall(r'\d+', commentStr)[0])
        print("comment:", commentNum)
    except (AttributeError, IndexError):
        # AttributeError: node missing; IndexError: no digits in the text.
        print('查找不到出游人數(shù)')
        commentNum = ''
    return {
        '名稱': productName,
        '鏈接': productLink,
        '類型': productTypeStr,
        '價(jià)格': priceStr,
        '供應(yīng)商': productRetailStr,
        '評(píng)分': gradeStr,
        '人數(shù)': commentNum,
    }
在產(chǎn)品頁(yè)面上獲取所有可見信息,并返回
6.數(shù)據(jù)保存
class ExcelFileManager:
    """Create .xls workbooks and append rows using xlrd/xlwt/xlutils.

    The methods were written without ``self`` and are called on the class
    itself (``ExcelFileManager.creatExcelFile(...)``); ``@staticmethod``
    makes that explicit and keeps existing call sites working.
    """

    @staticmethod
    def creatExcelFile(fileName, sheetName, headRowList):
        """Create (or reopen) ``<cwd>/<fileName>.xls`` and write a styled header row.

        fileName: workbook name without extension.
        sheetName: sheet to create (reused if it already exists).
        headRowList: column titles written into row 0.
        """
        filePath = os.path.join(os.getcwd(), fileName + '.xls')
        # Reuse an existing workbook if present, otherwise start a new one.
        try:
            oldFile = xlrd.open_workbook(filePath)
            file = copy(oldFile)
        except Exception:
            # Bug fix: bare ``except:`` narrowed.
            file = xlwt.Workbook()
            print("新建文件")
        # add_sheet raises if the sheet already exists; fall back to it.
        try:
            sheet1 = file.add_sheet(sheetName, cell_overwrite_ok=True)
        except Exception:
            sheet1 = file.get_sheet(sheetName)
        # Header style: red bold Times New Roman, numeric format for data cells.
        head_style = xlwt.easyxf(
            'font: name Times New Roman, color-index red, bold on',
            num_format_str='#,##0.00')
        for col, title in enumerate(headRowList):
            sheet1.write(0, col, title, head_style)
        print(filePath)
        file.save(filePath)

    @staticmethod
    def addDataToExcelFile(fileName, sheetName, dataList):
        """Append the rows in *dataList* below the data already in *sheetName*.

        dataList: sequence of row sequences; each inner value is written
        to consecutive columns starting at the first free row.
        """
        filePath = os.path.join(os.getcwd(), fileName + '.xls')
        file = xlrd.open_workbook(filePath)
        # nrows is the index of the first empty row.
        newRows = file.sheet_by_name(sheetName).nrows
        new_File = copy(file)
        sheet = new_File.get_sheet(sheetName)
        try:
            # Index-based access kept on purpose: it preserves the
            # original error behavior for non-sequence rows.
            for i in range(len(dataList)):
                for j in range(len(dataList[i])):
                    sheet.write(i + newRows, j, dataList[i][j])
        except Exception as e:
            print(e)
        new_File.save(filePath)
Excel文件創(chuàng)建與保存數(shù)據(jù),不得不說(shuō),python對(duì)Excel支持不是很友好,xlrd和xlwt僅支持讀和寫,不支持增加sheet或者在原有Excel文件上添加數(shù)據(jù)等操作,需要用到第三方庫(kù)
三、抓取結(jié)果:
1530848043475.jpg
總結(jié)
以上是生活随笔為你收集整理的python 携程_python 携程爬虫开发笔记的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問(wèn)題。
- 上一篇: STM32H743+CubeMX-QSP
- 下一篇: c++ 计算正弦的近似值_一篇文章搞懂正