杂记——淘宝/京东 商品/评论爬取+词云图制作
生活随笔
收集整理的這篇文章主要介紹了
杂记——淘宝/京东 商品/评论爬取+词云图制作
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
文章目錄
- 京東商品評論爬取
- 京東商品信息爬取
- 淘寶商品信息
- 詞云圖制作
- 情感分析
- 關于加圓環圖中間的圖片操作
京東商品評論爬取
productPageComments
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
import json
import re
from urllib import parse
import os

os.chdir(r'C:\Users\Administrator\Desktop')


def main(offset):
    """Fetch one page of JD product comments and append them to the sheet.

    offset: zero-based page number substituted into ``base_url``.
    Relies on the module-level ``base_url``, ``headers`` and ``sheet``.
    """
    url = base_url.format(offset)
    time.sleep(1)  # throttle so JD does not block the crawler
    req = requests.get(url, timeout=30, headers=headers)
    # The endpoint returns JSONP: ``fetchJSON_comment98...({...});``.
    # BUG FIX: str.lstrip/rstrip strip *character sets*, not prefixes/
    # suffixes, so the original lstrip/rstrip call could eat leading or
    # trailing JSON characters. Slice out the payload between the
    # outermost parentheses instead.
    text = req.text
    payload = text[text.find('(') + 1:text.rfind(')')]
    jd = json.loads(payload)
    for i in jd['comments']:
        comments = i['content']
        sheet.append([comments])


if __name__ == '__main__':
    base_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=40259347395&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1"
    wb = openpyxl.Workbook()  # workbook that collects the comments
    sheet = wb.active         # active worksheet
    # Header row.
    sheet.append(['評論'])
    # Request headers (desktop Chrome user agent).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      + 'Chrome/62.0.3202.94 Safari/537.36'
    }
    # multiprocessing.dummy.Pool is a *thread* pool, suitable for I/O.
    print('多線程爬取開始')
    start_time = time.time()
    p = Pool(8)
    p.map(main, [i for i in range(10)])  # pages 0-9
    # Save location.
    wb.save(r'C:\Users\Administrator\Desktop\info21.xlsx')
    end_time = time.time()
    print('多線程爬取結束')
    print('耗時:', end_time - start_time)
    p.close()
    # Wait for worker threads so the main process does not exit first.
    p.join()
京東商品信息爬取
沒有解決動態加載問題
from selenium import webdriver
import time


class JdSpider(object):
    """Scrape JD search-result listings for Python books with Selenium."""

    def __init__(self):
        self.browser = webdriver.Chrome()
        self.url = 'https://www.jd.com/'
        self.i = 0  # running count of products scraped

    def get_page(self):
        """Open jd.com and submit the search."""
        self.browser.get(self.url)
        # Locate the search box and the search button.
        self.browser.find_element_by_xpath('//*[@id="key"]').send_keys('Python書籍')
        self.browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button').click()
        time.sleep(2)  # give the result page time to load

    def parse_page(self):
        """Print price/name/comment-count/shop for each item on the page."""
        # Scroll to the bottom so lazily-loaded items are rendered.
        self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(2)
        li_list = self.browser.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li')
        for li in li_list:
            info = li.text.split('\n')
            # Promotion labels shift the field offsets, so dispatch on the
            # first line(s) of the entry text.
            if info[0].startswith('每滿'):
                price = info[1]
                name = info[2]
                number = info[3]
                market = info[4]
            elif info[0] == '單價':
                price = info[3]
                name = info[4]
                number = info[5]
                market = info[6]
            elif info[0].startswith('¥') and info[1].startswith('¥'):
                price = info[0]
                name = info[2]
                number = info[3]
                market = info[4]
            else:
                price = info[0]
                name = info[1]
                number = info[2]
                market = info[3]
            print(price, number, market, name)
            self.i += 1

    def main(self):
        """Search once, then parse page after page until the last one."""
        self.get_page()
        while True:
            self.parse_page()
            # 'pn-next disabled' appears in the page source only on the
            # last result page.
            if self.browser.page_source.find('pn-next disabled') == -1:
                self.browser.find_element_by_class_name('pn-next').click()
                time.sleep(3)
            else:
                break
        print(self.i)


if __name__ == '__main__':
    spider = JdSpider()
    spider.main()


# encoding: utf-8
# time: 2020/3/23 16:11

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from time import sleep
from pyquery import PyQuery as pq

# Chrome driver must be installed; use webdriver.Firefox() for Firefox.
browser = webdriver.Chrome()
browser.get('https://www.jd.com/')
# Explicit-wait helper: up to 10 seconds for each expected condition.
wait = WebDriverWait(browser, 10)

KEY = '泡面'


def search():
    """Search for KEY, scrape page 1, and return the total page count (str)."""
    try:
        # Selectors were obtained via "Copy selector" in the dev tools.
        box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))
        submit = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#search > div > div.form > button')))
        box.send_keys(KEY)
        submit.click()
        sleep(2)  # brief pause to avoid hammering the site
        # Wait until the pager highlights page 1, i.e. results are loaded.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), '1'))
        sleep(2)
        get_products()
        # Total number of result pages, e.g. "100".
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))
        return total.text
    # BUG FIX: Selenium waits raise TimeoutException, not the builtin
    # TimeoutError, so the original ``except TimeoutError`` never fired.
    except TimeoutException:
        return search()


def get_products():
    """Print a details dict for every product on the current result page."""
    # When the last <li> is present, the whole page has been rendered.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#J_goodsList > ul > li:last-child')))
    doc = pq(browser.page_source)
    # 60 product tiles per page under #J_goodsList.
    items = doc('#J_goodsList .gl-warp.clearfix .gl-item').items()
    for index, i in enumerate(items):
        # Normalise protocol-relative product links.
        if i('.p-img a').attr('href')[0] == '/':
            ss = 'http:' + i('.p-img a').attr('href')
        else:
            ss = i('.p-img a').attr('href')
        product = {
            'index': index,
            'price': i('.p-price i').text(),
            'name': i('.p-shop a').text(),
            'text': i('.p-name.p-name-type-2 em').text(),
            'commit': i('.p-commit a').text(),
            # 'img': i('.p-mig img').attr('src')  # image scraping failed
        }
        print(product)


def next_page(page_num):
    """Jump to result page ``page_num`` via the pager input and scrape it."""
    print('-------------------------------正在翻頁-------------------------------')
    sleep(2)
    try:
        # The page loads lazily; scroll down to force rendering of the pager.
        browser.execute_script('window.scrollTo(0, 0.8*document.body.scrollHeight)')
        sleep(1)
        page_input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
        confirm = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
        page_input.clear()
        page_input.send_keys(page_num)
        confirm.click()
        # Wait until the pager highlights the requested page (jump succeeded).
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page_num)))
        sleep(1)
        get_products()
    except TimeoutException:
        return next_page(page_num)


def main():
    total = search()
    # Page 1 was already scraped by search(); continue from page 2.
    # int(total) + 1 because range() excludes the upper bound.
    for i in range(2, int(total) + 1):
        next_page(i)


if __name__ == '__main__':
    main()
淘寶商品信息
from selenium import webdriver
import time
from pymysql import *
import re


def search_product(key):
    """Type ``key`` into the Taobao search box and return the page count."""
    driver.find_element_by_id('q').send_keys(key)
    driver.find_element_by_class_name("btn-search").click()
    driver.maximize_window()
    # Leave 15 seconds for a manual Taobao login before scraping starts.
    time.sleep(15)
    allPage = driver.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[1]').text
    # The pager text contains the total page count as its first number.
    allPage = re.findall(r'(\d+)', allPage)[0]
    return int(allPage)


def get_product():
    """Print the title of every product on the current result page."""
    divs = driver.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    for div in divs:
        title = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text
        print(title)


def main(keyWord):
    """Page through Taobao search results for ``keyWord`` and scrape each page."""
    allPage = search_product(keyWord)
    currentPage = 1
    while currentPage <= allPage:
        print("第{}頁數據".format(currentPage))
        print("*****************************************")
        # Taobao paginates with ``s`` = (page - 1) * 44 items.
        driver.get("https://s.taobao.com/search?q={}&s={}".format(keyWord, (currentPage - 1) * 44))
        driver.implicitly_wait(2)  # implicit element-lookup wait, not a sleep
        get_product()
        print("第{}頁數據保存成功".format(currentPage))
        currentPage += 1
        print("*****************************************")


if __name__ == '__main__':
    driver = webdriver.Chrome()
    driver.get("https://s.taobao.com/")
    main("水果")
(2)
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# Explicit waits for page loads.
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree

# --- Procedural version: open the Taobao home page and run one search. ---
url = 'https://www.taobao.com/?spm=a21bo.2017.201857.1.5af911d9lkTptM'
broswer = webdriver.Chrome()
broswer.get(url)
search_input = broswer.find_element_by_xpath('//*[@id="q"]')
search_input.clear()
search_name = input('請輸入商品名稱:')
search_input.send_keys(search_name)
time.sleep(1)
# Click the search button.
search_submit = broswer.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button')
search_submit.click()
# Grab and parse the result page.
page = broswer.page_source
html = etree.HTML(page)
print(html)
div_list = html.xpath('//div[@class="item J_MouserOnverReq "]')
for i in div_list:
    biaoti = i.xpath('.//a[@class="J_ClickStat"]/text()')


# --- Object-oriented version with an explicit login step. ---
class Taobao:
    """Log in to Taobao, search for a product, and dump the result page."""

    def __init__(self):
        # Instance attributes are shared across the methods below.
        self.url = 'https://login.taobao.com/member/login.jhtml?spm=a21bo.2017.754894437.1.5af911d99xZVlh&f=top&redirectURL=https%3A%2F%2Fwww.taobao.com%2F'
        self.broswer = webdriver.Chrome()
        # Wait up to 10 seconds for elements to appear.
        self.wait = WebDriverWait(self.broswer, 10)

    def log_in(self):
        """Fill in the login form, then search for a user-supplied product."""
        self.broswer.get(self.url)
        if self.broswer.find_element_by_xpath('//*[@id="fm-login-id"]'):
            user = self.broswer.find_element_by_xpath('//*[@id="fm-login-id"]')
            # SECURITY: real credentials are hardcoded in this published
            # source. They must be rotated immediately and loaded from a
            # secure store (e.g. environment variables), never committed.
            user.send_keys('15223820758')
            time.sleep(5)
            password = self.broswer.find_element_by_xpath('//*[@id="fm-login-password"]')
            password.send_keys('82648264ttgg')
            time.sleep(5)
            # Submit the login form.
            submit = self.broswer.find_element_by_xpath('//*[@id="login-form"]/div[4]/button')
            submit.click()
            time.sleep(3)
            # Search box on the landing page.
            search_input = self.broswer.find_element_by_xpath('//*[@id="q"]')
            search_input.clear()
            search_name = input('請輸入商品名稱:')
            search_input.send_keys(search_name)
            time.sleep(1)
            # Click the search button.
            search_submit = self.broswer.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button')
            search_submit.click()
            time.sleep(3)
            # Dump the raw result page; parsing was left unfinished here.
            page = self.broswer.page_source
            print(page)


taobao = Taobao()
taobao.log_in()
詞云圖制作
# Word-cloud generation from the scraped JD comments.
import jieba
import pandas as pd
import stylecloud
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\京東商品評論1.csv')
df.shape
# Drop duplicate comments, then rebuild a contiguous index.
df.drop_duplicates(inplace=True)
df.index = range(df.shape[0])
df.shape

# Keep only the Chinese portion of each comment; pure-emoji rows become
# NaN and are dropped.
df['處理后的評論'] = df['評論'].str.extract(r"([\u4e00-\u9fa5]+)")
df = df.dropna()


def yasuo(st):
    """Collapse immediately-repeated substrings ("mechanical" dedup).

    E.g. '好好好吃' -> '好吃'. Scans repeat lengths from 1 up to half the
    string and removes each run of consecutive repetitions.
    """
    for i in range(1, int(len(st) / 2) + 1):
        for j in range(len(st)):
            if st[j:j + i] == st[j + i:j + 2 * i]:
                k = j + i
                while st[k:k + i] == st[k + i:k + 2 * i] and k < len(st):
                    k = k + i
                st = st[:j] + st[k:]
    return st


df["處理后的評論1"] = df["評論"].apply(yasuo)


def get_cut_words(content_series):
    """Tokenise a Series of comments and return the filtered word list."""
    # Load the stop-word list.
    # NOTE(review): no encoding is specified, so the platform default is
    # used — confirm the stop-word file's encoding (utf-8 vs gbk).
    stop_words = []
    with open(r"C:\\Users\\Administrator\\Desktop\\chineseStopWords.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            stop_words.append(line.strip())
    # Register domain keywords so jieba keeps them whole.
    my_words = ['5G', 'CPS', '高速公路', '人工智能', '數字孿生體', '工業大數據', '智能大數據']
    for i in my_words:
        jieba.add_word(i)
    # Extra stop words.
    # BUG FIX: the original had '集團''1' (missing comma), which Python
    # concatenates into the single token '集團1' — so neither '集團' nor
    # '1' was actually filtered out.
    my_stop_words = ['京東', '鐵棍', '個頭', '...', '蘿卜', '集團', '1',
                     '簽署', '一根', '一個', '這次', '首個', '巴基斯坦',
                     '印尼', '奉節', '這是', '國家', '馬上', '超級', '改造']
    stop_words.extend(my_stop_words)
    # Join all comments and segment them.
    content = ';'.join([str(c) for c in content_series.tolist()])
    word_num = jieba.lcut(content)
    # Keep words of length >= 2 that are not stop words.
    word_num_selected = [i for i in word_num if i not in stop_words and len(i) >= 2]
    return word_num_selected


text1 = get_cut_words(content_series=df['處理后的評論1'])

# Map a few frequent words onto replacement labels.
sub_new = ['規模小', '特產', '方便']
sub_old = ['新鮮', '感覺', '包裝']
text1 = pd.DataFrame(text1)
for i, j in zip(sub_old, sub_new):
    text1.replace(i, j, inplace=True)

# Drop tokens containing any non-Chinese character.
# (Renamed from ``filter`` to avoid shadowing the builtin.)
patternDel = "[^\u4e00-\u9fa5]"
non_chinese_mask = text1[0].str.contains(patternDel)
text1 = text1[~non_chinese_mask]
# Frequency table as a DataFrame, exported for inspection.
word_count = text1.value_counts().rename_axis('unique_values').reset_index(name='counts')
word_count.to_excel('info.xls', index=False)

text1 = text1[0].values.tolist()

from stylecloud import gen_stylecloud
result = " ".join(text1)
gen_stylecloud(text=result,
               # A Chinese-capable font is mandatory, otherwise the
               # rendering is garbled.
               font_path='C:\\Windows\\Fonts\\STKAITI.TTF',
               # icon_name='fas fa-envira',
               icon_name='fas fa-carrot',
               max_words=150,
               size=2000,
               # max_font_size=70,
               output_name='C:\\Users\\Administrator\\Desktop\\t11123.png',)
情感分析
import paddlehub as hub
import pandas as pd

# Use Baidu's pretrained NLP model to predict sentiment polarity.
senta = hub.Module(name="senta_bilstm")
texts = df['評論'].tolist()
input_data = {'text': texts}
res = senta.sentiment_classify(data=input_data)
# Positive-class probability per comment, in [0, 1].
df['情感分值'] = [x['positive_probs'] for x in res]

# Resample to 15-minute buckets.
# BUG FIX: resample() requires a DatetimeIndex; assigning the raw column
# fails when the timestamps are strings, so parse them first.
# NOTE(review): assumes df carries a '發送時間' timestamp column from the
# earlier scraping step — confirm against the CSV schema.
df.index = pd.to_datetime(df['發送時間'])
data = df.resample('15min').mean().reset_index()
關于加圓環圖中間的圖片操作
總結(jié)
以上是生活随笔為你收集整理的杂记——淘宝/京东 商品/评论爬取+词云图制作的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: [react] 类组件和函数式组件有什么
- 下一篇: 机房计算机配置思维导图,运用思维导图培养