python网页爬虫例子_Python 利用Python编写简单网络爬虫实例3
by:授客 QQ:1033553122
實驗環境
python版本:3.3.5(2.7下會報錯)
實驗目的
獲取目標網站“http://bbs.51testing.com/forum.php”中特定url,通過分析發現,目標url同其它url的關系如下
目標url存在子頁面中的文章中,隨機分布,我們要把它找出來
python腳本
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib.request import *
import gzip, re
from io import BytesIO
from html.parser import HTMLParser
# Crawler class
class Reptile:
    """Download web pages and extract matching forum/thread URLs."""

    def __init__(self):
        # URLs that have already been fetched (visited-set).
        self.url_set = set()
        # Body of the most recently fetched page (decompressed if gzipped).
        self.data = ""

    # Download one page
    def get_page(self, url, headers):
        """Fetch *url* and return its body.

        headers: dict of extra HTTP headers (e.g. User-Agent).
        On any network error the previous page data is returned unchanged
        (best-effort crawl; failures are reported, not raised).
        """
        request = Request(url, headers=headers)
        # Ask for a gzip-compressed response to reduce network traffic.
        request.add_header('Accept-encoding', 'gzip')
        try:
            # Context manager ensures the response is closed (the original
            # leaked it).
            with urlopen(request) as response:
                if response.code == 200:  # request succeeded
                    page = response.read()
                    if response.info().get("Content-Encoding") == "gzip":
                        gzipper = gzip.GzipFile(fileobj=BytesIO(page))
                        self.data = gzipper.read()
                    else:
                        print("gzip unused")
                        # BUG FIX: the original assigned the undefined name
                        # `page_data` here, raising NameError for every
                        # uncompressed response; the raw body is what's wanted.
                        self.data = page
        except Exception as exc:
            # Best-effort: report and carry on (the original silently passed,
            # hiding all network errors).
            print("failed to fetch %s: %s" % (url, exc))
        self.url_set.add(url)
        return self.data

    def _filter_urls(self, url_set, home, include):
        """Drain *url_set*, keep entries matching the compiled regex
        *include*, and return them prefixed with *home* as a new set.
        NOTE: the input set is consumed (popped empty), matching the
        original behaviour."""
        matched = set()
        while len(url_set) > 0:
            url = url_set.pop()
            if re.findall(include, url):
                matched.add(home + url)
        return matched

    # Get the target board URLs of the forum
    def get_forum_url(self, url_set, home, include):
        """Return absolute board URLs (typically of the form forum-53-1.html)."""
        return self._filter_urls(url_set, home, include)

    # Get the thread URLs under a board URL
    def get_title_url(self, url_set, home, include):
        """Return absolute thread URLs (typically thread-1044711-1-1.html)."""
        return self._filter_urls(url_set, home, include)
# Parser class: collects every href attribute value seen in the document.
class MyHtmlParser(HTMLParser):
    def reset(self):
        # Let the base class clear its own state first (order matters),
        # then reset our collected URL set.
        HTMLParser.reset(self)
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        # Record the href value of any start tag that carries one.
        for name, value in attrs:
            if name == "href":
                self.url_set.add(value)
############## test driver ################
# Spoof a desktop-browser User-Agent so the site does not reject the crawler.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0"}
init_url = "http://bbs.51testing.com/forum.php"
# Build the parser. BUG FIX: the original passed strict=False, a parameter
# deprecated in Python 3.3 and removed in 3.5 (TypeError on modern Python);
# lenient parsing is the default.
parser = MyHtmlParser()
# Download the entry page
page_number = 1
print("program is downloading the first url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)
print("processing the %dth url page" % page_number)
# Parse the page (collect its URLs)
parser.feed(str(page))
# Collect the board-category URLs
home = "http://bbs.51testing.com/"
# Split across several variables purely for readable line lengths.
# BUG FIX: "forum-140-[-9]" matched a literal '-' or '9'; every sibling
# pattern uses [1-9], so the character class was corrected.
pattern1 = "forum-122-[1-9]|forum-243-[1-9]|forum-40-[1-9]|forum-63-[1-9]"
pattern2 = "|forum-42-[1-9]|forum-53-[1-9]|forum-275-[1-9]|forum-140-[1-9]"
pattern3 = "|forum-138-[1-9]|forum-139-[1-9]|forum-141-[1-9]"
pattern = pattern1 + pattern2 + pattern3
include = re.compile(pattern)
forum_url_set = reptile.get_forum_url(parser.url_set, home, include)
# For each category board, collect pages 1-10 of its sub-board (pagination) URLs.
result_url_set = set()
forum_index = 1
for forum_url in forum_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(str(page))
    print("getting the board urls in the %dth forum page" % forum_index)
    tmp_url_set = reptile.get_forum_url(parser.url_set, home, include)
    forum_index = forum_index + 1
    # BUG FIX: the original used '^' (symmetric difference), which would
    # DROP any URL discovered on an even number of pages; accumulation
    # needs a union.
    result_url_set = result_url_set | tmp_url_set

title_url_set = set()
forum_index = 1
title_index = 1
# Thread URLs look like thread-1044711-1-1.html: 6-7 digit thread id,
# 1-digit page, 1-2 digit suffix. Compiled once, outside the loop
# (the original recompiled it on every iteration).
pattern1 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{1}[.]html|"
pattern2 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{1}[.]html|"
pattern3 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{2}[.]html|"
pattern4 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{2}[.]html"
include = re.compile(pattern1 + pattern2 + pattern3 + pattern4)
for forum_url in result_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(str(page))
    # Collect the thread URLs under this board
    print("getting all title urls in the %dth forum board" % forum_index)
    tmp_url_set = reptile.get_title_url(parser.url_set, home, include)
    forum_index = forum_index + 1
    # BUG FIX: union instead of symmetric difference (see above).
    title_url_set = title_url_set | tmp_url_set

# Collect the target URLs
target_index = 1
title_index = 1
filepath = "d:/url.txt"
# Target pattern is loop-invariant: compile once.
target_pattern = re.compile("http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]{5}")
for title_url in title_url_set:
    print("processing the %dth title url" % title_index)
    page = reptile.get_page(title_url, headers)
    parser.feed(str(page))
    # Save the target URLs
    with open(filepath, "a") as f:
        while len(parser.url_set) > 0:
            url = parser.url_set.pop()
            if re.findall(target_pattern, url):
                print("find target! saving the %dth target url in the %dth title page" % (target_index, title_index))
                f.write("the %dth url: %s" % (target_index, url))
                target_index = target_index + 1
                f.write("\n")
    title_index = title_index + 1
print("complete")
結果:
聲明:僅供學習研究使用,請勿用于其它非法用途
總結
以上是生活随笔為你收集整理的python网页爬虫例子_Python 利用Python编写简单网络爬虫实例3的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: python识别简单训练模型_Pytho
- 下一篇: python空格_python 空格