Python: Scraping 2345 Movies and Writing the Results to a File
1. Goal: scrape the newest 2017 movies from the 2345 movie site (dianying.2345.com)
2. Libraries used: requests, BeautifulSoup (from bs4), and codecs
Test environment: Python 3.6.0
3. Target URL
http://dianying.2345.com/list/----2017---2.html
Click "next page" a few times and observe how the URL changes from page to page.
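Only the trailing page number before .html changes:

http://dianying.2345.com/list/----2017---1.html
http://dianying.2345.com/list/----2017---2.html
http://dianying.2345.com/list/----2017---3.html

So every page URL can be built by splicing a page number between a fixed head and the .html suffix.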
4. Observing with the developer tools
All of the movie entries live in the div with {'class': 'v_picConBox mt15'}; inside that div is a ul, and the ul contains one li tag per movie.
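A much-abridged sketch of that structure, reconstructed from the class names the scraper selects on below (the real page carries extra attributes and elements, so treat this as an assumption, not a copy of the site's markup):

<div class="v_picConBox mt15">
  <ul class="v_picTxt pic180_240 clearfix">
    <li>
      <span class="sTit">movie title</span>
      <span class="pRightBottom"><em>分:7.2</em></span>   <!-- score, prefixed with 分: -->
      <span class="sDes">主演:actor A  actor B</span>      <!-- starring, prefixed with 主演: -->
    </li>
    <!-- ... one <li> per movie ... -->
  </ul>
</div>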
(1) First, write a function that fetches a whole page:
def getHTMLText(self, url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # use the encoding guessed from the content so Chinese text decodes correctly
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

(2) Next, work out the total number of pages; "inspect element" is useful here.
def getPages(self):
    html = self.getHTMLText(self.urlBase)
    soup = BeautifulSoup(html, 'lxml')
    # tag = soup.find('div', attrs={'class': 'v_picConBox mt15'})
    tag = soup.find('div', attrs={'class': 'v_page'})
    subTags = tag.find_all('a')
    # read the page count from the pager
    return int(subTags[-2].get_text())
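Why subTags[-2]? In a pager shaped like the (assumed) sketch below, the last anchor is the "next page" link and the second-to-last anchor holds the highest page number:

<div class="v_page">
  <a>1</a> <a>2</a> <a>3</a> ... <a>30</a> <a>下一页</a>
</div>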
(3) A function to assemble the page URLs; its pages argument comes from the getPages() function above.
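From the full listing below:

def getUrls(self, pages):
    urlHead = 'http://dianying.2345.com/list/----2017---'
    urlEnd = '.html'
    for i in range(1, pages + 1):
        url = urlHead + str(i) + urlEnd
        self.urls.append(url)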
(4) The page-scraping function, which extracts three fields per movie: movieName / movieScore / movieStaring.
An item class was defined earlier to hold these fields; both it and the scraping function are shown below.
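From the full listing below:

class MovieItem(object):
    movieName = None
    movieScore = None
    movieStaring = None

def spider(self, urls):
    for url in urls:
        htmlContent = self.getHTMLText(url)
        soup = BeautifulSoup(htmlContent, 'lxml')
        anchorTag = soup.find('ul', attrs={'class': 'v_picTxt pic180_240 clearfix'})
        tags = anchorTag.find_all('li')
        for tag in tags:
            item = MovieItem()
            item.movieName = tag.find('span', attrs={'class': 'sTit'}).getText()
            # strip the '分:' ("score:") prefix
            item.movieScore = tag.find('span', attrs={'class': 'pRightBottom'}).em.get_text().replace('分:', '')
            # strip the '主演:' ("starring:") prefix
            item.movieStaring = tag.find('span', attrs={'class': 'sDes'}).get_text().replace('主演:', '')
            self.items.append(item)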
(5) The last function, save()
The codecs module is imported here; it lets you choose the encoding used when writing. Earlier programs had to encode every string to UTF-8 by hand before writing it to a txt file. With codecs it is enough to open the file as

codecs.open(filename, 'w', 'utf8')

and every string written to that handle is saved as UTF-8 automatically.
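A minimal before/after sketch of the difference; the manual-encoding variant stands in for those "earlier programs" and is illustrative only:

# manual: encode each string yourself and write bytes
with open('movies.txt', 'wb') as fp:
    fp.write('主演:xxx\n'.encode('utf-8'))

# codecs: the stream object encodes on every write
import codecs
with codecs.open('movies.txt', 'w', 'utf8') as fp:
    fp.write('主演:xxx\n')

With that in place, save() looks like this: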
def save(self, items):
    count = 0
    fileName = '2017熱門電影.txt'.encode('GBK')
    # formatting template; two approaches were tried (see the commented lines below)
    tplt = "{0:^10}\t{1:<10}\t{2:^10}"
    # open with the codecs module imported earlier, so writes are UTF-8
    with codecs.open(fileName, 'w', 'utf-8') as fp:
        # items is the list that already holds the scraped content
        for item in items:
            # fp.write('%s \t %s \t %s \r\n' % (item.movieName, item.movieScore, item.movieStaring))
            # tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
            fp.write(tplt.format(item.movieName, item.movieScore, item.movieStaring))
            count = count + 1
            # '\r' with end='' redraws the progress counter on a single line
            print('\r當前進度:{:.2f}%'.format(count * 100 / len(items)), end='')
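A quick, hypothetical demo of what the template does: ^ centers and < left-aligns within a 10-character field. CJK characters are wider than the ASCII padding space, so columns drift; the commented-out {1:{3}^10} variant passes chr(12288), the full-width space, as the fill character to keep them aligned:

tplt = "{0:^10}\t{1:<10}\t{2:^10}"
print(tplt.format('神偷奶爸3', '7.3', '主演甲'))
# full-width-space padding lines CJK columns up better:
print("{0:{3}^10}\t{1:{3}^10}\t{2:{3}^10}".format('神偷奶爸3', '7.3', '主演甲', chr(12288)))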
Full code

from bs4 import BeautifulSoup
import requests
import codecs


class MovieItem(object):
    movieName = None
    movieScore = None
    movieStaring = None


class GetMovie(object):
    def __init__(self):
        self.urlBase = 'http://dianying.2345.com/list/----2017--.html'
        self.pages = self.getPages()
        self.urls = []  # holds the assembled page URLs
        self.items = []
        self.getUrls(self.pages)
        self.spider(self.urls)
        self.save(self.items)

    def getHTMLText(self, url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return ""

    def getPages(self):
        html = self.getHTMLText(self.urlBase)
        soup = BeautifulSoup(html, 'lxml')
        # tag = soup.find('div', attrs={'class': 'v_picConBox mt15'})
        tag = soup.find('div', attrs={'class': 'v_page'})
        subTags = tag.find_all('a')
        # read the page count from the pager
        return int(subTags[-2].get_text())

    def getUrls(self, pages):
        urlHead = 'http://dianying.2345.com/list/----2017---'
        urlEnd = '.html'
        for i in range(1, pages + 1):
            url = urlHead + str(i) + urlEnd
            self.urls.append(url)

    def spider(self, urls):
        for url in urls:
            htmlContent = self.getHTMLText(url)
            soup = BeautifulSoup(htmlContent, 'lxml')
            anchorTag = soup.find('ul', attrs={'class': 'v_picTxt pic180_240 clearfix'})
            # print(anchorTag)
            tags = anchorTag.find_all('li')
            for tag in tags:
                item = MovieItem()
                item.movieName = tag.find('span', attrs={'class': 'sTit'}).getText()
                item.movieScore = tag.find('span', attrs={'class': 'pRightBottom'}).em.get_text().replace('分:', '')
                item.movieStaring = tag.find('span', attrs={'class': 'sDes'}).get_text().replace('主演:', '')
                self.items.append(item)

    def save(self, items):
        count = 0
        fileName = '2017熱門電影.txt'.encode('GBK')
        tplt = "{0:^10}\t{1:<10}\t{2:^10}"
        with codecs.open(fileName, 'w', 'utf-8') as fp:
            for item in items:
                # fp.write('%s \t %s \t %s \r\n' % (item.movieName, item.movieScore, item.movieStaring))
                # tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
                fp.write(tplt.format(item.movieName, item.movieScore, item.movieStaring))
                # records are written without a separator -- handled poorly, to be fixed later
                count = count + 1
                print('\r當前進度:{:.2f}%'.format(count * 100 / len(items)), end='')


if __name__ == '__main__':
    GM = GetMovie()
Ad-removal fixed version

The list page embeds an advertisement as the tenth li, which lacks the spans the parser expects; this version drops it with tags.pop(9) and wraps the field extraction in try/except. It also targets 2018 instead of 2017.

from bs4 import BeautifulSoup
import requests
import codecs


class MovieItem(object):
    movieName = None
    movieScore = None
    movieStaring = None


class GetMovie(object):
    def __init__(self):
        self.urlBase = 'http://dianying.2345.com/list/----2018--.html'
        self.pages = self.getPages()
        self.urls = []  # holds the assembled page URLs
        self.items = []
        self.getUrls(self.pages)
        self.spider(self.urls)
        self.save(self.items)

    def getHTMLText(self, url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return ""

    def getPages(self):
        html = self.getHTMLText(self.urlBase)
        soup = BeautifulSoup(html, 'lxml')
        # tag = soup.find('div', attrs={'class': 'v_picConBox mt15'})
        tag = soup.find('div', attrs={'class': 'v_page'})
        subTags = tag.find_all('a')
        # read the page count from the pager
        # print(subTags)
        # print("aaaa", int(subTags[-2].get_text()))
        return int(subTags[-2].get_text())

    def getUrls(self, pages):
        urlHead = 'http://dianying.2345.com/list/----2018---'
        urlEnd = '.html'
        for i in range(1, pages + 1):
            url = urlHead + str(i) + urlEnd
            self.urls.append(url)

    def spider(self, urls):
        for url in urls:
            htmlContent = self.getHTMLText(url)
            soup = BeautifulSoup(htmlContent, 'lxml')
            anchorTag = soup.find('ul', attrs={'class': 'v_picTxt pic180_240 clearfix'})
            # print(anchorTag)
            tags = anchorTag.find_all('li')
            # the tenth <li> is an advertisement, not a movie -- drop it
            tags.pop(9)
            # print(tags[9])
            for tag in tags:
                try:
                    item = MovieItem()
                    item.movieName = tag.find('span', attrs={'class': 'sTit'}).get_text().strip()
                    item.movieScore = tag.find('span', attrs={'class': 'pRightBottom'}).em.get_text().replace('分:', '')
                    item.movieStaring = tag.find('span', attrs={'class': 'sDes'}).get_text().replace('主演:', '')
                    # print(item.movieName, item.movieScore, item.movieStaring)
                    self.items.append(item)
                except Exception as e:
                    raise e

    def save(self, items):
        count = 0
        fileName = '2018熱門電影.txt'
        tplt = "{0:^10}\t{1:^10}\t{2:^10}"
        # tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
        # for item in items:
        #     # print(tplt.format(item.movieName, item.movieScore, item.movieStaring, chr(12288)))
        #     print(tplt.format(item.movieName, item.movieScore, item.movieStaring))
        with codecs.open(fileName, 'w', 'utf-8') as fp:
            for item in items:
                fp.write(tplt.format(item.movieName, item.movieScore, item.movieStaring) + '\n')
                count = count + 1
                print('\r當前進度:{:.2f}%'.format(count * 100 / len(items)), end='')


if __name__ == '__main__':
    GM = GetMovie()

Summary

Compared with the first version, this one removes the advertisement li before parsing, strips whitespace from the title, writes a newline after each record, and uses a plain output filename instead of one encoded to GBK.