Python 携程爬虫开发笔记
前言
最近購(gòu)買了《Python3 爬蟲、數(shù)據(jù)清洗與可視化實(shí)戰(zhàn)》,剛好適逢暑假,就嘗試從攜程頁(yè)面對(duì)廣州的周邊游產(chǎn)品進(jìn)行爬蟲數(shù)據(jù)捕捉。
因?yàn)椴艑W(xué)Python不夠一個(gè)星期,python的命名規(guī)范還是不太了解,只能套用之前iOS開發(fā)的命名規(guī)范,有不足之處請(qǐng)多多指點(diǎn)
一、前期
1.主要用到的庫(kù)
from bs4 import BeautifulSoup
import time
import re #正則表達(dá)式
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains #瀏覽器操作
import xlrd
import xlwt
from xlutils.copy import copy
import os
BeautifulSoup:用于對(duì)標(biāo)簽等數(shù)據(jù)進(jìn)行定位和抓取
selenium:用于啟動(dòng)瀏覽器和對(duì)頁(yè)面進(jìn)行自動(dòng)操作
time:暫停等待操作
xlrd、xlwt、xlutils:對(duì)數(shù)據(jù)結(jié)果進(jìn)行Excel讀寫保存操作
2.核心思路
1,跳進(jìn)出發(fā)點(diǎn)的周邊游頁(yè)面(廣州)
2,在首頁(yè)捕捉推薦的熱門目的地和熱點(diǎn)景點(diǎn),進(jìn)行保存
3,針對(duì)目的地地點(diǎn)進(jìn)行遍歷搜索所展示的旅游產(chǎn)品
4,產(chǎn)品數(shù)據(jù)參數(shù)抓取
5,數(shù)據(jù)保存
6,退出瀏覽器
二、代碼
1.啟動(dòng)瀏覽器
def setupDriverSetting():
    """Launch a Chrome browser via Selenium and open Ctrip's "around trips" home page.

    Side effect: binds the module-level ``driver`` that every other
    function in this script uses.
    """
    global driver
    # Desktop site listing weekend/around-city trip products.
    url = 'https://weekend.ctrip.com/around/'
    # Chrome is used here; a Firefox profile with a custom User-Agent
    # would also work if Chrome is unavailable.
    driver = webdriver.Chrome()
    driver.get(url)
用webdriver啟動(dòng)Chrome或者fireFox,并跳進(jìn)首頁(yè)URL
2.選擇出發(fā)點(diǎn)城市
def select_StartPlace(startPlace):
    """Pick the departure city and navigate to its dedicated page.

    startPlace: exact visible city name to match (e.g. '广州').
    """
    # Open the departure-city dropdown.
    driver.find_element_by_xpath("//*[@id='CitySelect']").click()
    # Each <ul> under the selector holds a group of candidate city links.
    cityList = driver.find_elements_by_xpath("//*[@id='CitySelect']/dd/ul")
    for group in cityList:
        for eachCity in group.find_elements(By.TAG_NAME, "a"):
            if eachCity.text == startPlace:
                print("找到目標(biāo)城市:" + eachCity.get_attribute('href'))
                driver.get(eachCity.get_attribute('href'))
                time.sleep(2)
                try:
                    WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
                except Exception:
                    # Bug fix: was a bare ``except:`` which also swallows
                    # KeyboardInterrupt/SystemExit.
                    print('出發(fā)地頁(yè)面加載不成功')
                # Bug fix: the original ``break`` only exited the inner
                # loop; once the city is handled there is nothing left to
                # do, so stop scanning entirely.
                return
主要是用find_element_by_xpath尋找目標(biāo)城市進(jìn)行選擇篩選,然后跳到城市專頁(yè)
3.搜索目的地
def finAllDestinationPage():
    """Scrape the recommended-destination sidebar on the city page.

    Returns a dict mapping category name (e.g. hot destinations / hot
    POIs) to the list of destination names under it.
    """
    # 'J_sub_circum' scopes results to around-trip products
    # (as opposed to outbound travel).
    destType = driver.find_element_by_id("J_sub_circum")
    print(destType.text)
    destType1 = destType.find_element_by_class_name("side_jmp_dest")
    item = BeautifulSoup(destType1.get_attribute('innerHTML'), 'lxml')
    allDestinationListDic = {}
    for each in item.find_all('li'):
        typeName = each.h4.string
        # Bug fix: the original bound the accumulator to ``list``,
        # shadowing the builtin; a comprehension avoids the name entirely.
        allDestinationListDic[typeName] = [a.string for a in each.find_all('a')]
    return allDestinationListDic
搜索所有可推薦目的地和景點(diǎn),并用字典保存
4.旅游產(chǎn)品列表頁(yè)
def jump_destinationPage(startPlace, destination):
    """Search *destination* from the city page and walk every result page.

    startPlace: departure city, re-selected after the search to guard
        against the site resetting it.
    destination: keyword typed into the search box.
    Returns the module-level driver so callers can keep using the session.
    """
    # Wait for the search box of the city page.
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
    except Exception:
        # Bug fix: bare ``except:`` narrowed.
        print('查找不到搜索欄')
    finally:
        print('本地頁(yè)面加載完畢')
    driver.find_element_by_xpath("//input[@id='SearchText']").send_keys(destination)
    print("輸入目的地:" + destination)
    driver.find_element_by_xpath("//*[@id='SearchBtn']").click()
    print("點(diǎn)擊搜索按鈕結(jié)束")
    time.sleep(2)
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='js-dpSearcher']")))
    except Exception:
        print('產(chǎn)品列表頁(yè)加載不成功')
    finally:
        print('產(chǎn)品列表頁(yè)加載完畢')
    # Re-select the departure city: the search can reset it.
    reSelect_StartPlace(startPlace)
    # The pager text ends with '頁(yè)'; the second number in it is the
    # total page count (e.g. "共 1/12頁(yè)" -> 12).
    pageHtml = driver.find_element_by_xpath("//*[@id='_sort']/div/span")
    print(pageHtml.text)
    pageNumStr = pageHtml.text[:-1]  # drop the trailing '頁(yè)' character
    print("獲取的num:" + pageNumStr)
    pageNumS = re.findall(r'\d+', pageNumStr)
    pageNum = int(pageNumS[1])
    print(pageNum)
    tourProductList = []
    for i in range(pageNum):
        itemList = showCurrentPageAllData()
        # Collect every product card on the current page.
        for j in range(len(itemList)):
            tourProductList.append(collectCurrentPageEachData(j))
        # Bug fix: the original clicked "next page" on every iteration,
        # including the last one, trying to navigate past the final page.
        if i < pageNum - 1:
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").clear()
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").send_keys(str(i + 2))
            driver.find_element_by_xpath("//*[@id='ipt_page_btn']").click()
            print("點(diǎn)擊下一頁(yè)結(jié)束->" + str(i + 2) + "頁(yè)")
            time.sleep(2)
    return driver
跳進(jìn)產(chǎn)品頁(yè),并根據(jù)標(biāo)簽,抓取總頁(yè)數(shù),在遍歷所有旅游產(chǎn)品后,再跳到下一頁(yè)進(jìn)行循環(huán)遍歷
5.產(chǎn)品數(shù)據(jù)抓取
def collectCurrentPageEachData(itemNum):
    """Parse the *itemNum*-th product card on the current result page.

    Returns a dict with the product's name, link, type, price, vendor,
    grade and head-count (grade/head-count default to '' when absent).
    """
    itemList = driver.find_elements_by_class_name("product_box")
    # Bug fix: the original named this variable ``str``, shadowing the builtin.
    itemHtml = itemList[itemNum].get_attribute('innerHTML')
    item = BeautifulSoup(itemHtml, "lxml")  # soup for this single card
    # --- product name ---
    titleNameHtml = item.find('h2', class_='product_title')
    print("-------" + titleNameHtml.get_text())
    productName = titleNameHtml.get_text()
    # --- product link: href starts with '//', force https ---
    productLink = "https://" + titleNameHtml.a['href'][2:]
    print("link:" + productLink)
    # --- product type ---
    productType = item.find('em')
    print("type:" + productType.get_text())
    productTypeStr = productType.get_text()
    # --- price: normalize integer prices to two decimals ---
    priceStr = item.find('span', class_='sr_price').strong.get_text()
    if priceStr.isdigit():
        priceStr = "%.2f" % float(priceStr)
    print("price:" + priceStr)
    # --- vendor: title attribute may carry a 4-char prefix to strip ---
    productRetailStr = item.find('p', class_='product_retail')['title']
    if "供應(yīng)商" in productRetailStr:
        productRetailStr = productRetailStr[4:]
    print("retail:" + productRetailStr)
    # --- grade: optional on some cards ---
    try:
        gradeStr = item.find('p', class_='grade').strong.get_text()
        print("grade:" + gradeStr)
    except AttributeError:
        # Bug fix: bare ``except:`` narrowed — a missing node makes
        # ``find`` return None, raising AttributeError.
        print('查找不到評(píng)分')
        gradeStr = ''
    # --- traveller count: optional on some cards ---
    try:
        commentStr = item.find('div', class_='comment').em.get_text()
        commentNum = int(re.findall(r'\d+', commentStr)[0])
        print("comment:", commentNum)
    except (AttributeError, IndexError):
        # AttributeError: node missing; IndexError: no digits in the text.
        print('查找不到出游人數(shù)')
        commentNum = ''
    return {
        '名稱': productName,
        '鏈接': productLink,
        '類型': productTypeStr,
        '價(jià)格': priceStr,
        '供應(yīng)商': productRetailStr,
        '評(píng)分': gradeStr,
        '人數(shù)': commentNum,
    }
在產(chǎn)品頁(yè)面上獲取所有可見信息,并返回
6.數(shù)據(jù)保存
class ExcelFileManager:
    """Create .xls workbooks and append rows using xlrd/xlwt/xlutils.

    The methods were written without ``self`` and are called on the class
    itself (``ExcelFileManager.creatExcelFile(...)``); ``@staticmethod``
    makes that explicit and keeps existing call sites working.
    """

    @staticmethod
    def creatExcelFile(fileName, sheetName, headRowList):
        """Create (or reopen) ``<cwd>/<fileName>.xls`` and write a styled header row.

        fileName: workbook name without extension.
        sheetName: sheet to create (reused if it already exists).
        headRowList: column titles written into row 0.
        """
        filePath = os.path.join(os.getcwd(), fileName + '.xls')
        # Reuse an existing workbook if present, otherwise start a new one.
        try:
            oldFile = xlrd.open_workbook(filePath)
            file = copy(oldFile)
        except Exception:
            # Bug fix: bare ``except:`` narrowed.
            file = xlwt.Workbook()
            print("新建文件")
        # add_sheet raises if the sheet already exists; fall back to it.
        try:
            sheet1 = file.add_sheet(sheetName, cell_overwrite_ok=True)
        except Exception:
            sheet1 = file.get_sheet(sheetName)
        # Header style: red bold Times New Roman, numeric format for data cells.
        head_style = xlwt.easyxf(
            'font: name Times New Roman, color-index red, bold on',
            num_format_str='#,##0.00')
        for col, title in enumerate(headRowList):
            sheet1.write(0, col, title, head_style)
        print(filePath)
        file.save(filePath)

    @staticmethod
    def addDataToExcelFile(fileName, sheetName, dataList):
        """Append the rows in *dataList* below the data already in *sheetName*.

        dataList: sequence of row sequences; each inner value is written
        to consecutive columns starting at the first free row.
        """
        filePath = os.path.join(os.getcwd(), fileName + '.xls')
        file = xlrd.open_workbook(filePath)
        # nrows is the index of the first empty row.
        newRows = file.sheet_by_name(sheetName).nrows
        new_File = copy(file)
        sheet = new_File.get_sheet(sheetName)
        try:
            # Index-based access kept on purpose: it preserves the
            # original error behavior for non-sequence rows.
            for i in range(len(dataList)):
                for j in range(len(dataList[i])):
                    sheet.write(i + newRows, j, dataList[i][j])
        except Exception as e:
            print(e)
        new_File.save(filePath)
Excel文件創(chuàng)建與保存數(shù)據(jù),不得不說(shuō),python對(duì)Excel支持不是很友好,xlrd和xlwt僅支持讀和寫,不支持增加sheet或者在原有Excel文件上添加數(shù)據(jù)等操作,需要用到第三方庫(kù)
三、抓取結(jié)果:
1530848043475.jpg
總結(jié)
以上是生活随笔為你收集整理的python 携程_python 携程爬虫开发笔记的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問(wèn)題。
- 上一篇: STM32H743+CubeMX-QSP
- 下一篇: c++ 计算正弦的近似值_一篇文章搞懂正