用Python3解析html的几种操作方式,你都会用吗?
解析html是爬虫后的一个重要的数据处理环节。以下记录解析html的几种方式。
先介绍基础的辅助函数,主要用于获取html并输出解析后的结果
# The parser is passed in as an argument so the examples below can swap it.
# NOTE: the default value is evaluated when this ``def`` runs, so
# ``bs4_paraser`` must already be defined at that point.
def get_html(url, paraser=bs4_paraser):
    """Download *url* and return the result of running *paraser* on its body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the response body (gunzipped when the
            server compressed it) and returns the parsed value.

    Returns:
        Whatever ``paraser`` returns when the response status is 200,
        otherwise ``None``.
    """
    headers = {
        'Accept': '*/*',
        # Only gzip is advertised because it is the only encoding decoded
        # below; offering deflate/sdch could yield a body we cannot read.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # No explicit 'Host': urllib2 derives it from the URL, so the function
        # also works for URLs on other hosts.
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36',
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        if response.code != 200:
            return None
        body = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''
    finally:
        # Release the connection even if reading fails.
        response.close()

    # Decompress only when the server says it compressed the body; a plain
    # response would otherwise make GzipFile raise.
    if 'gzip' in content_encoding.lower():
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()

    # For offline testing a saved copy can be parsed instead, e.g.
    # open('E:/h5/haPkY0osd0r5UB.html').read()
    return paraser(body)


value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response; print nothing in that case.
for row in value or []:
    print(row)

# 1. Parsing with lxml.html
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2
and libxslt. It is unique in that it combines the speed and XML
feature completeness of these libraries with the simplicity of a
native Python API, mostly compatible but superior to the well-known
ElementTree API. The latest release works with all CPython versions
from 2.6 to 3.5. See the introduction for more information about
background and goals of the lxml project. Some common questions are
answered in the FAQ. 官网
2.使用BeautifulSoup,基本过时了,多的不说了,大家网上找资料看看
def bs4_paraser(html):
    """Extract the review entries from *html* using BeautifulSoup.

    Args:
        html: Markup of the page, passed straight to ``BeautifulSoup``.

    Returns:
        A list with one dict per ``div.item`` inside the first
        ``div.yingping-list-wrap``. A dict may contain ``title``,
        ``title_href``, ``score``, ``time`` and ``people``; an item without a
        ``g-clear title-wrap`` block yields an empty dict, and ``score`` /
        ``people`` are omitted when their source text holds no digits.
    """
    def first_int(text):
        # Return the first run of digits in *text* as an int, or None when
        # there is none (also covers a tag whose ``.string`` is None).
        match = re.search(r'\d+', text or '')
        return int(match.group()) if match else None

    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # Only the first review container is examined (limit=1).
    containers = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for container in containers:
        # Every review is one ``div.item`` inside the container.
        for item in container.find_all('div', attrs={'class': 'item'}):
            value = {}
            # Title block of the review.
            title = item.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if title:
                head = title[0]
                value['title'] = head.a.string
                value['title_href'] = head.a['href']
                # The score is the first integer in the inline style divided
                # by 20 -- presumably a width percentage mapped onto a 0-5
                # scale; TODO confirm against the page markup.
                score = first_int(head.div.span.span['style'])
                if score is not None:
                    value['score'] = score / 20
                # Publication time.
                value['time'] = head.div.find_all('span', attrs={'class': 'time'})[0].string
                # Number of people who liked the review.
                people = first_int(head.find_all('div', attrs={'class': 'num'})[0].span.string)
                if people is not None:
                    value['people'] = people
            all_value.append(value)
    return all_value

# 3. Using SGMLParser: parsing is driven by start/end tag callbacks. The
# parsing process is fairly transparent but somewhat tedious, and this
# example's scenario is not a great fit for the approach.
# (Promotional note carried over from the article: the author runs a Python
# study QQ group, 531509025, offering peer help, video tutorials and PDF
# e-books.)
class CommentParaser(SGMLParser):
    """Event-driven extractor for the review list, built on ``SGMLParser``.

    After ``feed()`` the collected reviews are available in ``self.data`` as a
    list of dicts with the keys ``href``, ``title``, ``score``, ``time`` and
    ``people``; a key is present only when the matching markup was seen.

    NOTE(review): the boolean flags do not track nesting depth. Once the
    ``g-clear title-wrap`` div has closed, the next ``</div>`` seen while
    inside an item finishes that item.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Which <div> containers are currently considered open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title <a>.
        self.__start_a = False
        # <span> state: 0 idle, 1 inside span.rating, 2 inside span.time,
        # 3 nested style span handled, 4 span inside div.num.
        self.__span_state = 0
        # Review being assembled, and the finished reviews.
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        """Raise the flag that corresponds to the div's exact class value."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Close the innermost div that is flagged open; emit finished items."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item itself is closing: store it and start a new one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            if self.__start_div_num:
                self.__start_div_num = False
            if self.__start_div_ratingwrap:
                self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def start_a(self, attrs):
        """Record the link target of an <a> inside the title block."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        """Leave title-link mode."""
        if (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear and self.__start_a):
            self.__start_a = False

    def start_span(self, attrs):
        """Update the span state; read the score from a nested style span."""
        if not (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear):
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested inside span.rating: its style holds the score.
                for k, v in attrs:
                    if k == 'style':
                        # A style without digits is skipped instead of
                        # raising on a failed match.
                        match = re.search(r'\d+', v or '')
                        if match:
                            self.__value['score'] = int(match.group()) / 20
                        self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Any closing span returns the span state to idle."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits (e.g. whitespace) is ignored rather than
            # crashing the whole parse.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())


def sgl_parser(html):
    """Feed *html* to a ``CommentParaser`` and return the collected reviews."""
    parser = CommentParaser()
    parser.feed(html)
    return parser.data

# 4. HTMLParser: same principle as approach 3; only the callback methods
# differ, so the logic can largely be shared.
class CommentHTMLParser(HTMLParser.HTMLParser):
    """Event-driven extractor for the review list, built on ``HTMLParser``.

    After ``feed()`` the collected reviews are available in ``self.data`` as a
    list of dicts with the keys ``href``, ``title``, ``score``, ``time`` and
    ``people``; a key is present only when the matching markup was seen.

    NOTE(review): the boolean flags do not track nesting depth. Once the
    ``g-clear title-wrap`` div has closed, the next ``</div>`` seen while
    inside an item finishes that item.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Which <div> containers are currently considered open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title <a>.
        self.__start_a = False
        # <span> state: 0 idle, 1 inside span.rating, 2 inside span.time,
        # 3 nested style span handled, 4 span inside div.num.
        self.__span_state = 0
        # Review being assembled, and the finished reviews.
        self.__value = {}
        self.data = []

    def __in_title_block(self):
        # True while the list, an item and its title block are all open.
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def __open_div(self, attrs):
        # Raise the flag that corresponds to the div's exact class value.
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def __open_a(self, attrs):
        # Record the link target of an <a> inside the title block.
        if self.__in_title_block():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def __open_span(self, attrs):
        # Update the span state; read the score from a nested style span.
        if not self.__in_title_block():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested inside span.rating: its style holds the score.
                for k, v in attrs:
                    if k == 'style':
                        # A missing value or a style without digits is
                        # skipped instead of raising on a failed match.
                        match = re.search(r'\d+', v or '')
                        if match:
                            self.__value['score'] = int(match.group()) / 20
                        self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def __close_div(self):
        # Close the innermost div that is flagged open; emit finished items.
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item itself is closing: store it and start a new one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            if self.__start_div_num:
                self.__start_div_num = False
            if self.__start_div_ratingwrap:
                self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening ``div``, ``a`` or ``span``; ignore other tags."""
        if tag == 'div':
            self.__open_div(attrs)
        elif tag == 'a':
            self.__open_a(attrs)
        elif tag == 'span':
            self.__open_span(attrs)

    def handle_endtag(self, tag):
        """Dispatch a closing ``div``, ``a`` or ``span``; ignore other tags."""
        if tag == 'div':
            self.__close_div()
        elif tag == 'a':
            if self.__in_title_block() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            # Any closing span returns the span state to idle.
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits (e.g. whitespace) is ignored rather than
            # crashing the whole parse.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())


def html_parser(html):
    """Feed *html* to a ``CommentHTMLParser`` and return the collected reviews."""
    parser = CommentHTMLParser()
    parser.feed(html)
    return parser.data

# Approaches 3 and 4 are admittedly a poor fit for this case; they are
# recorded here, while there is time, for learning purposes.
以上这篇对Python3 解析html的几种操作方式小结就是小编分享给大家的全部内容了,希望能给大家一个参考。
总结
以上是生活随笔为你收集整理的用Python3解析html的几种操作方式,你都会用吗?的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 学习Python一定要知道的在定义变量中
- 下一篇: python 把if 写在一行的两种方式