[Web Scraping] Case Study: Scraping the Douban Top 250 [Complete + Detailed]
Contents
- 1. Requirements
- 2. Workflow
- 3. Implementation
  - Imports
  - Main function
  - Fetching the data
  - Parsing the data
  - Saving to Excel
  - Simple database schema
  - Saving to the database
- 4. Complete Code
- 5. Notes
1. Requirements
Scrape the Douban Top 250 movie list for each film's detail-page link, poster image link, Chinese and foreign titles, rating, number of raters, and one-line summary, then persist the data to both an Excel file and a MySQL database.
2. Workflow
Fetch the pages with the requests library, then parse the HTML and extract the fields with bs4 (BeautifulSoup) and re (regular expressions).
Finally, write the data to an Excel file with xlwt and to a MySQL database with pymysql.
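Before walking through the implementation, a minimal smoke test of this pipeline might look like the sketch below, assuming the third-party packages (requests, beautifulsoup4) are installed. The `<div class="item">` selector is the same one the implementation uses; each listing page holds 25 movies.

```python
# Minimal smoke test: fetch the first page and count the movie entries.
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0'}  # Douban rejects the default requests UA
r = requests.get("https://movie.douban.com/top250?start=0",
                 headers=headers, timeout=3)
soup = BeautifulSoup(r.text, "html.parser")
print(len(soup.find_all("div", class_="item")))  # expect 25 per page
```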
3. Implementation
Imports
```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup   # HTML parsing / data extraction
import re                       # regular expressions for text matching
import requests                 # fetching the web pages
import xlwt                     # Excel (.xls) output
import pymysql.cursors          # MySQL access
```
Main function

```python
def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # Scrape the pages
    dataList = getDate(baseUrl)
    # Persist the data
    savePath = "豆瓣top250.xls"
    saveData(savePath, dataList)
    saveDataToDb(dataList)
```
Fetching the data

```python
# Fetch the HTML of the given URL
def askUrl(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
    }
    html = ""
    try:
        r = requests.get(url=url, headers=headers, timeout=3)
        r.encoding = 'utf-8'
        html = r.text
    except Exception as e:
        # urllib-style error attributes; requests exceptions may not carry them
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
```
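Note that `e.code` and `e.reason` are urllib-style attributes that requests exceptions generally do not carry, so the `hasattr` guards usually print nothing. A more requests-idiomatic variant of the same fetch might look like this sketch:

```python
import requests

def ask_url_v2(url):
    """Sketch of a requests-idiomatic fetch with explicit error handling."""
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=3)
        r.raise_for_status()              # raise on HTTP 4xx/5xx
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as e:
        print(e)                          # timeouts, connection and HTTP errors
        return ""
```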
Parsing the data

```python
def getDate(baseUrl):
    dataList = []
    for i in range(0, 10):                  # 10 pages of 25 movies
        url = baseUrl + str(i * 25)
        html = askUrl(url)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []                       # all fields of one movie
            item = str(item)
            link = re.findall(r'<a href="(.*)">', item)[0]          # detail link
            data.append(link)
            image = re.findall(r'<img.*src="(.*)" .*/>', item)[0]   # poster link
            data.append(image)
            titles = re.findall(r'<span class="title">(.*)</span>', item)
            data.append(titles[0])          # Chinese title
            if len(titles) == 2:            # foreign title, if present
                data.append(titles[1].replace("/", ""))  # strip the "/" separator
            else:
                data.append(" ")
            rate = re.findall(r'<span class="rating_num".*>(.*)</span>', item)[0]
            data.append(rate)               # rating
            judge = re.findall(r'<span>(\d*)人评价</span>', item)[0]
            data.append(judge)              # number of raters
            inq = re.findall(r'<span class="inq">(.*)</span>', item, re.S)
            if len(inq) != 0:               # one-line summary
                data.append(inq[0].replace("。", ""))
            else:
                data.append("")
            bd = re.findall(r'<p class="">(.*?)</p>', item, re.S)[0]  # other info
            bd = re.sub('<br/>', " ", bd)
            bd = re.sub("/", " ", bd)
            bd = re.sub("\\n", " ", bd)
            bd = re.sub(r"\xa0", " ", bd)
            data.append(bd.strip())
            dataList.append(data)
    return dataList
```
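The regular expressions rely on `str(item)` producing the prettified, multi-line markup that BeautifulSoup returns, so the greedy `.*` stays within one line. A small test against a hypothetical, trimmed-down `item` fragment (the real page has the same spans with more attributes; bs4 renders `&nbsp;` as `\xa0`):

```python
import re

# Hypothetical trimmed-down item markup; the newlines matter because
# '.' does not cross them, keeping each greedy match on its own line.
item = """<div class="item">
<a href="https://movie.douban.com/subject/1292052/">
<img alt="肖申克的救赎" src="https://img.example/p480747492.jpg" width="100"/>
</a>
<span class="title">肖申克的救赎</span>
<span class="title">\xa0/\xa0The Shawshank Redemption</span>
<span class="rating_num" property="v:average">9.7</span>
<span>2000000人评价</span>
</div>"""

print(re.findall(r'<a href="(.*)">', item)[0])                         # detail link
print(re.findall(r'<span class="title">(.*)</span>', item))            # both titles
print(re.findall(r'<span class="rating_num".*>(.*)</span>', item)[0])  # 9.7
print(re.findall(r'<span>(\d*)人评价</span>', item)[0])                 # 2000000
```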
Saving to Excel

```python
def saveData(savePath, dataList):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet("豆瓣top250", cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片英文名",
           "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        worksheet.write(0, i, col[i])       # header row
    for i in range(0, len(dataList)):       # 250 rows when every page parses
        data = dataList[i]
        for j in range(0, 8):
            worksheet.write(i + 1, j, data[j])
    workbook.save(savePath)
```
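To verify the output, the sheet can be read back, for example with the xlrd package (an assumption here, not part of the original code; xlwt writes the legacy .xls format, which xlrd still supports):

```python
import xlrd  # assumed installed; xlrd 2.x still reads legacy .xls files

book = xlrd.open_workbook("豆瓣top250.xls")
sheet = book.sheet_by_index(0)
print(sheet.nrows)            # expect 251: one header row + 250 movies
print(sheet.row_values(1))    # fields of the first movie
```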
Simple database schema

```sql
DROP TABLE IF EXISTS `top250`;
CREATE TABLE `top250` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `link` varchar(255) DEFAULT NULL,
  `image` varchar(255) DEFAULT NULL,
  `cname` varchar(255) DEFAULT NULL,
  `oname` varchar(255) DEFAULT NULL,
  `rate` varchar(255) DEFAULT NULL,
  `judge` varchar(255) DEFAULT NULL,
  `inq` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=517 DEFAULT CHARSET=utf8;
```
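The same schema can also be created from Python instead of a MySQL shell. A sketch with pymysql, assuming a local server with an existing `douban` database and the credentials used in the code below:

```python
import pymysql

connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='', db='douban', charset='utf8')
with connect.cursor() as cursor:
    cursor.execute("DROP TABLE IF EXISTS `top250`")
    cursor.execute("""
        CREATE TABLE `top250` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `link` varchar(255) DEFAULT NULL,
          `image` varchar(255) DEFAULT NULL,
          `cname` varchar(255) DEFAULT NULL,
          `oname` varchar(255) DEFAULT NULL,
          `rate` varchar(255) DEFAULT NULL,
          `judge` varchar(255) DEFAULT NULL,
          `inq` varchar(255) DEFAULT NULL,
          PRIMARY KEY (`id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    """)
connect.close()
```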
Saving to the database

```python
def saveDataToDb(dataList):
    # Connect once, not once per row
    connect = pymysql.Connect(host='localhost', port=3306, user='root',
                              passwd='', db='douban', charset='utf8')
    cursor = connect.cursor()
    sql = ("INSERT INTO top250 (link, image, cname, oname, rate, judge, inq) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s)")
    for data in dataList:
        print(data)
        # Parameterized query: pymysql escapes the values itself
        cursor.execute(sql, (data[0], data[1], data[2], data[3],
                             data[4], data[5], data[6]))
    connect.commit()
    connect.close()
```
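A quick sanity check that all rows landed, assuming the same connection settings as above:

```python
import pymysql

connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='', db='douban', charset='utf8')
with connect.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM top250")
    print(cursor.fetchone()[0])   # expect 250
connect.close()
```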
4. Complete Code

```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup   # HTML parsing / data extraction
import re                       # regular expressions for text matching
import requests                 # fetching the web pages
import xlwt                     # Excel (.xls) output


def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # Scrape the pages
    dataList = getDate(baseUrl)
    # Persist the data
    savePath = "豆瓣top250.xls"
    saveData(savePath, dataList)
    # saveDataToDb(dataList)


# Fetch the HTML of the given URL
def askUrl(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
    }
    html = ""
    try:
        r = requests.get(url=url, headers=headers, timeout=3)
        r.encoding = 'utf-8'
        html = r.text
    except Exception as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def getDate(baseUrl):
    dataList = []
    for i in range(0, 10):                  # 10 pages of 25 movies
        url = baseUrl + str(i * 25)
        html = askUrl(url)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []                       # all fields of one movie
            item = str(item)
            link = re.findall(r'<a href="(.*)">', item)[0]          # detail link
            data.append(link)
            image = re.findall(r'<img.*src="(.*)" .*/>', item)[0]   # poster link
            data.append(image)
            titles = re.findall(r'<span class="title">(.*)</span>', item)
            data.append(titles[0])          # Chinese title
            if len(titles) == 2:            # foreign title, if present
                data.append(titles[1].replace("/", ""))
            else:
                data.append(" ")
            rate = re.findall(r'<span class="rating_num".*>(.*)</span>', item)[0]
            data.append(rate)               # rating
            judge = re.findall(r'<span>(\d*)人评价</span>', item)[0]
            data.append(judge)              # number of raters
            inq = re.findall(r'<span class="inq">(.*)</span>', item, re.S)
            if len(inq) != 0:               # one-line summary
                data.append(inq[0].replace("。", ""))
            else:
                data.append("")
            bd = re.findall(r'<p class="">(.*?)</p>', item, re.S)[0]  # other info
            bd = re.sub('<br/>', " ", bd)
            bd = re.sub("/", " ", bd)
            bd = re.sub("\\n", " ", bd)
            bd = re.sub(r"\xa0", " ", bd)
            data.append(bd.strip())
            dataList.append(data)
    return dataList


def saveData(savePath, dataList):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet("豆瓣top250", cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片英文名",
           "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        worksheet.write(0, i, col[i])       # header row
    for i in range(0, len(dataList)):
        data = dataList[i]
        for j in range(0, 8):
            worksheet.write(i + 1, j, data[j])
    workbook.save(savePath)


if __name__ == "__main__":
    main()
```

5. Notes
Background on working with databases from Python:
Bilibili link: https://www.bilibili.com/video/BV12E411A7ZQ