Scraping JD.com Product Info and Comments into MySQL with Python
Contents

Building the MySQL tables
Version 1
Version 2
Version 3
Summary
構(gòu)建mysql數(shù)據(jù)表
問(wèn)題:使用SQL alchemy時(shí),非主鍵不能設(shè)置為自增長(zhǎng),但是我想讓這個(gè)非主鍵僅僅是為了作為索引,autoincrement=True無(wú)效,該怎么實(shí)現(xiàn)讓它自增長(zhǎng)呢?
from sqlalchemy import String, Integer, Text, Column
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/jdcrawl?charset=utf8",
    pool_size=200,
    max_overflow=300,
    echo=False
)

BASE = declarative_base()  # instantiate the declarative base

class Goods(BASE):
    __tablename__ = 'goods'
    id = Column(Integer(), primary_key=True, autoincrement=True)
    sku_id = Column(String(200), primary_key=True, autoincrement=False)
    name = Column(String(200))
    price = Column(String(200))
    comments_num = Column(Integer)
    shop = Column(String(200))
    link = Column(String(200))

class Comments(BASE):
    __tablename__ = 'comments'
    id = Column(Integer(), primary_key=True, autoincrement=True, nullable=False)
    sku_id = Column(String(200), primary_key=True, autoincrement=False)
    comments = Column(Text())

BASE.metadata.create_all(engine)
Session = sessionmaker(engine)
sess_db = scoped_session(Session)
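The schema above dodges the restriction with a composite primary key: id comes first and carries autoincrement=True, and MySQL accepts that because the auto-increment column leads a key. If all that is really needed is an indexed sku_id, a simpler layout is to make the auto-increment id the only primary key and declare sku_id as a plain indexed column. A minimal sketch, assuming the same engine as above; the comments_alt table name is hypothetical, chosen only to avoid clashing with the comments table:

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class CommentsAlt(Base):
    __tablename__ = 'comments_alt'  # hypothetical name for illustration
    # Sole primary key: MySQL requires the auto-increment column to head a key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Indexed lookup column; not part of the key, so repeats per SKU are fine.
    sku_id = Column(String(200), index=True)
    comments = Column(Text())

With index=True, SQLAlchemy emits a plain (non-unique) secondary index on sku_id, which is all the lookup use case calls for.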
問(wèn)題:爬取幾頁(yè)評(píng)論后就會(huì)爬取到空白頁(yè),添加refer后依舊如此
嘗試解決方法:將獲取評(píng)論地方的線程池改為單線程,并每獲取一頁(yè)評(píng)論增加延時(shí)1s
# Don't crawl too fast!!! Otherwise the comments come back empty.
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv, json, re
import threadpool
import time
from jd_mysqldb import Goods, Comments, sess_db

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num = 0           # product counter
comments_num = 0  # comment counter

# Get product info and the SkuId
def getIndex(url):
    session = requests.Session()
    session.headers = headers
    global num
    res = session.get(url, headers=headers)
    print(res.status_code)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    items = soup.select('li.gl-item')
    for item in items[:3]:  # test with 3 products
        title = item.select_one('.p-name a em').text.strip().replace(' ', '')
        price = item.select_one('.p-price strong').text.strip().replace('¥', '')
        try:
            shop = item.select_one('.p-shopnum a').text.strip()  # shop selector for books
        except:
            shop = item.select_one('.p-shop a').text.strip()  # shop selector for other products
        link = parse.urljoin('https://', item.select_one('.p-img a').get('href'))
        SkuId = re.search(r'\d+', link).group()
        comments_num = getCommentsNum(SkuId, session)
        print(SkuId, title, price, shop, link, comments_num)
        print("Saving to the database...")
        try:
            IntoGoods(SkuId, title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
        print("Fetching comments...")
        # get the total number of comment pages
        url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        res2 = session.get(url1, headers=headers)
        res2.encoding = res2.apparent_encoding
        json_data = json.loads(res2.text)
        max_page = json_data['maxPage']  # at most 100 pages are served, 10 comments per page
        args = []
        for i in range(0, max_page):
            # this URL returns the comments as JSON
            url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            # this URL returns JSONP instead, which needs extracting
            # url2_2 = f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            args.append(([session, SkuId, url2], None))
        pool2 = threadpool.ThreadPool(2)  # 2 threads
        reque2 = threadpool.makeRequests(getComments, args)  # create the tasks
        for r in reque2:
            pool2.putRequest(r)  # submit each task to the pool
        pool2.wait()

# Get the total number of comments
def getCommentsNum(SkuId, sess):
    headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res = sess.get(url, headers=headers)
    try:
        res.encoding = res.apparent_encoding
        json_data = json.loads(res.text)  # parse the JSON into a dict
        num = json_data['CommentsCount'][0]['CommentCount']
        return num
    except:
        return 'Error'

# Get the comments
def getComments(sess, SkuId, url2):
    global comments_num
    print(url2)
    headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
    res2 = sess.get(url2, headers=headers)
    res2.encoding = 'gbk'
    json_data = res2.text
    '''
    # if url2_2 is used, the JSON has to be cut out of the JSONP wrapper:
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data = res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    try:
        comments = dict_data['comments']
        for item in comments:
            comment = item['content'].replace('\n', '')
            # print(comment)
            comments_num += 1
            try:
                IntoComments(SkuId, comment)
            except Exception as e:
                print(e)
                sess_db.rollback()
    except:
        pass

# Save product info to the database
def IntoGoods(SkuId, title, price, shop, link, comments_num):
    goods_data = Goods(sku_id=SkuId, name=title, price=price, comments_num=comments_num, shop=shop, link=link)
    sess_db.add(goods_data)
    sess_db.commit()

# Save a comment to the database
def IntoComments(SkuId, comment):
    comments_data = Comments(sku_id=SkuId, comments=comment)
    sess_db.add(comments_data)
    sess_db.commit()

if __name__ == '__main__':
    start_time = time.time()
    urls = []
    KEYWORD = parse.quote(input("Enter a search keyword: "))
    for i in range(1, 2):  # test with a single search page
        url = f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url, ], None))  # the argument format threadpool requires
    pool = threadpool.ThreadPool(2)  # thread pool with 2 threads
    reque = threadpool.makeRequests(getIndex, urls)  # create the tasks
    for r in reque:
        pool.putRequest(r)  # submit each task to the pool
    pool.wait()  # wait for every task to finish
    print("Got {} products and {} comments, took {}s".format(num, comments_num, time.time() - start_time))
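Since the blank pages look like throttling rather than a permanent block, one hedge before giving up on threads entirely is to treat an empty or non-JSON body as throttling and retry the request after a pause. A minimal sketch; the helper name, retry count, and delay are illustrative assumptions, not part of the script above:

import json
import time
import requests

def get_json_with_retry(sess, url, retries=3, delay=2.0):
    """Fetch url and parse JSON, backing off when JD serves a blank page."""
    for attempt in range(retries):
        res = sess.get(url)
        res.encoding = res.apparent_encoding
        if res.text.strip():
            try:
                return json.loads(res.text)
            except json.JSONDecodeError:
                pass  # blank page or HTML interstitial: treat as throttled
        time.sleep(delay * (attempt + 1))  # back off a little more each time
    return None  # still blank after all retries; let the caller decide

getComments could then call this helper instead of sess.get directly and simply skip a page when it returns None.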
經(jīng)測(cè)試,的確不會(huì)出現(xiàn)空白頁(yè)的情況
進(jìn)一步優(yōu)化:同時(shí)獲取2個(gè)以上商品的評(píng)論
# Don't crawl too fast!!! Otherwise the comments come back empty.
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv, json, re
import threadpool
import time
from jd_mysqldb import Goods, Comments, sess_db

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num = 0           # product counter
comments_num = 0  # comment counter

# Get product info and the SkuId
def getIndex(url):
    session = requests.Session()
    session.headers = headers
    global num
    res = session.get(url, headers=headers)
    print(res.status_code)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    items = soup.select('li.gl-item')
    for item in items[:2]:  # test with 2 products
        title = item.select_one('.p-name a em').text.strip().replace(' ', '')
        price = item.select_one('.p-price strong').text.strip().replace('¥', '')
        try:
            shop = item.select_one('.p-shopnum a').text.strip()  # shop selector for books
        except:
            shop = item.select_one('.p-shop a').text.strip()  # shop selector for other products
        link = parse.urljoin('https://', item.select_one('.p-img a').get('href'))
        SkuId = re.search(r'\d+', link).group()
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        comments_num = getCommentsNum(SkuId, session)
        print(SkuId, title, price, shop, link, comments_num)
        print("Saving the product to the database...")
        try:
            IntoGoods(SkuId, title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
        print("Fetching comments...")
        # get the total number of comment pages
        url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
        res2 = session.get(url1, headers=headers)
        res2.encoding = res2.apparent_encoding
        json_data = json.loads(res2.text)
        max_page = json_data['maxPage']  # at most 100 pages are served, 10 comments per page
        print("{} has {} pages of comments".format(SkuId, max_page))
        if max_page == 0:
            IntoComments(SkuId, '0')
        else:
            for i in range(0, max_page):
                # this URL returns the comments as JSON
                url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
                # this URL returns JSONP instead, which needs extracting
                # url2_2 = f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
                print("Fetching comment page {}: {}".format(i + 1, url2))
                getComments(session, SkuId, url2)
                time.sleep(1)

# Get the total number of comments
def getCommentsNum(SkuId, sess):
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res = sess.get(url)
    try:
        res.encoding = res.apparent_encoding
        json_data = json.loads(res.text)  # parse the JSON into a dict
        num = json_data['CommentsCount'][0]['CommentCount']
        return num
    except:
        return 'Error'

# Get the comments
def getComments(sess, SkuId, url2):
    global comments_num
    res2 = sess.get(url2)
    res2.encoding = res2.apparent_encoding
    json_data = res2.text
    '''
    # if url2_2 is used, the JSON has to be cut out of the JSONP wrapper:
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data = res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    comments = dict_data['comments']
    for item in comments:
        comment = item['content'].replace('\n', '')
        # print(comment)
        comments_num += 1
        try:
            IntoComments(SkuId, comment)
        except Exception as e:
            print(e)
            sess_db.rollback()

# Save product info to the database
def IntoGoods(SkuId, title, price, shop, link, comments_num):
    goods_data = Goods(sku_id=SkuId, name=title, price=price, comments_num=comments_num, shop=shop, link=link)
    sess_db.add(goods_data)
    sess_db.commit()

# Save a comment to the database
def IntoComments(SkuId, comment):
    comments_data = Comments(sku_id=SkuId, comments=comment)
    sess_db.add(comments_data)
    sess_db.commit()

if __name__ == '__main__':
    start_time = time.time()
    urls = []
    KEYWORD = parse.quote(input("Enter a search keyword: "))
    for i in range(1, 2):  # test with a single search page
        url = f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url, ], None))  # the argument format threadpool requires
    pool = threadpool.ThreadPool(2)  # thread pool with 2 threads
    reque = threadpool.makeRequests(getIndex, urls)  # create the tasks
    for r in reque:
        pool.putRequest(r)  # submit each task to the pool
    pool.wait()  # wait for every task to finish
    print("Got {} products and {} comments, took {}s".format(num, comments_num, time.time() - start_time))
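The several-products-at-once idea can also be written with the standard library's concurrent.futures instead of the third-party threadpool module. A sketch under the assumption of a worker fetch_one(skuid, session) that downloads and stores every comment page for one SKU (the getComments of the third version below has exactly that shape):

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

def fetch_comments_concurrently(fetch_one, skuids, max_workers=3):
    session = requests.Session()  # one shared session, as in the original scripts
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map each submitted future back to its SKU for error reporting
        futures = {pool.submit(fetch_one, skuid, session): skuid for skuid in skuids}
        for fut in as_completed(futures):
            skuid = futures[fut]
            try:
                fut.result()  # re-raises any exception from the worker
            except Exception as e:
                print(f"{skuid} failed: {e}")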
Version 3:

No good: the blank pages are back.
# Don't crawl too fast!!! Otherwise the comments come back empty.
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv, json, re
import threadpool
import time
from jd_mysqldb import Goods, Comments, sess_db

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num = 0           # product counter
comments_num = 0  # comment counter

# Get product info and the SkuId
def getIndex(url):
    global num
    skuids = []
    session = requests.Session()
    session.headers = headers
    res = session.get(url, headers=headers)
    print(res.status_code)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    items = soup.select('li.gl-item')
    for item in items[:3]:  # test with 3 products
        title = item.select_one('.p-name a em').text.strip().replace(' ', '')
        price = item.select_one('.p-price strong').text.strip().replace('¥', '')
        try:
            shop = item.select_one('.p-shopnum a').text.strip()  # shop selector for books
        except:
            shop = item.select_one('.p-shop a').text.strip()  # shop selector for other products
        link = parse.urljoin('https://', item.select_one('.p-img a').get('href'))
        SkuId = re.search(r'\d+', link).group()
        skuids.append(([SkuId, session], None))
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        comments_num = getCommentsNum(SkuId, session)  # comment count
        print(SkuId, title, price, shop, link, comments_num)
        print("Saving the product to the database...")
        try:
            IntoGoods(SkuId, title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
    print("Fetching comments and saving them to the database...")
    pool2 = threadpool.ThreadPool(3)  # fetch comments of 3 products at the same time
    task = threadpool.makeRequests(getComments, skuids)
    for r in task:
        pool2.putRequest(r)
    pool2.wait()

# Get the comments
def getComments(SkuId, sess):
    # get the total number of comment pages
    url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
    res2 = sess.get(url1, headers=headers)
    res2.encoding = res2.apparent_encoding
    json_data = json.loads(res2.text)
    max_page = json_data['maxPage']  # at most 100 pages are served, 10 comments per page
    print("{} has {} pages of comments".format(SkuId, max_page))
    if max_page == 0:
        IntoComments(SkuId, '0')
    else:
        for i in range(0, max_page):
            # this URL returns the comments as JSON
            url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            # this URL returns JSONP instead, which needs extracting
            # url2_2 = f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            print("Fetching comment page {}: {}".format(i + 1, url2))
            getComments_one(sess, SkuId, url2)
            time.sleep(1)

# Get the total number of comments
def getCommentsNum(SkuId, sess):
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res = sess.get(url)
    try:
        res.encoding = res.apparent_encoding
        json_data = json.loads(res.text)  # parse the JSON into a dict
        num = json_data['CommentsCount'][0]['CommentCount']
        return num
    except:
        return 'Error'

# Get a single page of comments
def getComments_one(sess, SkuId, url2):
    global comments_num
    res2 = sess.get(url2)
    res2.encoding = res2.apparent_encoding
    json_data = res2.text
    '''
    # if url2_2 is used, the JSON has to be cut out of the JSONP wrapper:
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data = res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    comments = dict_data['comments']
    for item in comments:
        comment = item['content'].replace('\n', '')
        # print(comment)
        comments_num += 1
        try:
            IntoComments(SkuId, comment)
        except Exception as e:
            print(e)
            print("rollback!")
            sess_db.rollback()

# Save product info to the database
def IntoGoods(SkuId, title, price, shop, link, comments_num):
    goods_data = Goods(sku_id=SkuId, name=title, price=price, comments_num=comments_num, shop=shop, link=link)
    sess_db.add(goods_data)
    sess_db.commit()

# Save a comment to the database
def IntoComments(SkuId, comment):
    comments_data = Comments(sku_id=SkuId, comments=comment)
    sess_db.add(comments_data)
    sess_db.commit()

if __name__ == '__main__':
    start_time = time.time()
    urls = []
    KEYWORD = parse.quote(input("Enter a search keyword: "))
    for i in range(1, 2):  # test with a single search page
        url = f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url, ], None))  # the argument format threadpool requires
    pool = threadpool.ThreadPool(2)  # thread pool with 2 threads
    reque = threadpool.makeRequests(getIndex, urls)  # create the tasks
    for r in reque:
        pool.putRequest(r)  # submit each task to the pool
    pool.wait()  # wait for every task to finish
    print("Got {} products and {} comments, took {}s".format(num, comments_num, time.time() - start_time))
京東的反爬有點(diǎn)強(qiáng),如果不想爬取到空白頁(yè),只能用單線程加延時(shí)一條一條的爬