python网页爬虫例子_Python 利用Python编写简单网络爬虫实例3
by:授客 QQ:1033553122
實驗環境
python版本:3.3.5(2.7下會報錯)
實驗目的
獲取目標網站“http://bbs.51testing.com/forum.php”中特定url,通過分析發現,目標url同其它url的關系如下
目標url存在子頁面中的文章中,隨機分布,我們要把它找出來
python腳本
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib.request import *
import gzip, re
from io import BytesIO
from html.parser import HTMLParser
# Crawler class
class Reptile:
    """Download web pages and extract matching forum/thread URLs."""

    def __init__(self):
        # URLs that have already been fetched (visited-set).
        self.url_set = set()
        # Body of the most recently fetched page (decompressed if gzipped).
        self.data = ""

    # Download one page
    def get_page(self, url, headers):
        """Fetch *url* and return its body.

        headers: dict of extra HTTP headers (e.g. User-Agent).
        On any network error the previous page data is returned unchanged
        (best-effort crawl; failures are reported, not raised).
        """
        request = Request(url, headers=headers)
        # Ask for a gzip-compressed response to reduce network traffic.
        request.add_header('Accept-encoding', 'gzip')
        try:
            # Context manager ensures the response is closed (the original
            # leaked it).
            with urlopen(request) as response:
                if response.code == 200:  # request succeeded
                    page = response.read()
                    if response.info().get("Content-Encoding") == "gzip":
                        gzipper = gzip.GzipFile(fileobj=BytesIO(page))
                        self.data = gzipper.read()
                    else:
                        print("gzip unused")
                        # BUG FIX: the original assigned the undefined name
                        # `page_data` here, raising NameError for every
                        # uncompressed response; the raw body is what's wanted.
                        self.data = page
        except Exception as exc:
            # Best-effort: report and carry on (the original silently passed,
            # hiding all network errors).
            print("failed to fetch %s: %s" % (url, exc))
        self.url_set.add(url)
        return self.data

    def _filter_urls(self, url_set, home, include):
        """Drain *url_set*, keep entries matching the compiled regex
        *include*, and return them prefixed with *home* as a new set.
        NOTE: the input set is consumed (popped empty), matching the
        original behaviour."""
        matched = set()
        while len(url_set) > 0:
            url = url_set.pop()
            if re.findall(include, url):
                matched.add(home + url)
        return matched

    # Get the target board URLs of the forum
    def get_forum_url(self, url_set, home, include):
        """Return absolute board URLs (typically of the form forum-53-1.html)."""
        return self._filter_urls(url_set, home, include)

    # Get the thread URLs under a board URL
    def get_title_url(self, url_set, home, include):
        """Return absolute thread URLs (typically thread-1044711-1-1.html)."""
        return self._filter_urls(url_set, home, include)
# Parser class: collects every href attribute value seen in the document.
class MyHtmlParser(HTMLParser):
    def reset(self):
        # Let the base class clear its own state first (order matters),
        # then reset our collected URL set.
        HTMLParser.reset(self)
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        # Record the href value of any start tag that carries one.
        for name, value in attrs:
            if name == "href":
                self.url_set.add(value)
############## test driver ################
# Spoof a desktop-browser User-Agent so the site does not reject the crawler.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0"}
init_url = "http://bbs.51testing.com/forum.php"
# Build the parser. BUG FIX: the original passed strict=False, a parameter
# deprecated in Python 3.3 and removed in 3.5 (TypeError on modern Python);
# lenient parsing is the default.
parser = MyHtmlParser()
# Download the entry page
page_number = 1
print("program is downloading the first url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)
print("processing the %dth url page" % page_number)
# Parse the page (collect its URLs)
parser.feed(str(page))
# Collect the board-category URLs
home = "http://bbs.51testing.com/"
# Split across several variables purely for readable line lengths.
# BUG FIX: "forum-140-[-9]" matched a literal '-' or '9'; every sibling
# pattern uses [1-9], so the character class was corrected.
pattern1 = "forum-122-[1-9]|forum-243-[1-9]|forum-40-[1-9]|forum-63-[1-9]"
pattern2 = "|forum-42-[1-9]|forum-53-[1-9]|forum-275-[1-9]|forum-140-[1-9]"
pattern3 = "|forum-138-[1-9]|forum-139-[1-9]|forum-141-[1-9]"
pattern = pattern1 + pattern2 + pattern3
include = re.compile(pattern)
forum_url_set = reptile.get_forum_url(parser.url_set, home, include)
# For each category board, collect pages 1-10 of its sub-board (pagination) URLs.
result_url_set = set()
forum_index = 1
for forum_url in forum_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(str(page))
    print("getting the board urls in the %dth forum page" % forum_index)
    tmp_url_set = reptile.get_forum_url(parser.url_set, home, include)
    forum_index = forum_index + 1
    # BUG FIX: the original used '^' (symmetric difference), which would
    # DROP any URL discovered on an even number of pages; accumulation
    # needs a union.
    result_url_set = result_url_set | tmp_url_set

title_url_set = set()
forum_index = 1
title_index = 1
# Thread URLs look like thread-1044711-1-1.html: 6-7 digit thread id,
# 1-digit page, 1-2 digit suffix. Compiled once, outside the loop
# (the original recompiled it on every iteration).
pattern1 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{1}[.]html|"
pattern2 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{1}[.]html|"
pattern3 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{2}[.]html|"
pattern4 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{2}[.]html"
include = re.compile(pattern1 + pattern2 + pattern3 + pattern4)
for forum_url in result_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(str(page))
    # Collect the thread URLs under this board
    print("getting all title urls in the %dth forum board" % forum_index)
    tmp_url_set = reptile.get_title_url(parser.url_set, home, include)
    forum_index = forum_index + 1
    # BUG FIX: union instead of symmetric difference (see above).
    title_url_set = title_url_set | tmp_url_set

# Collect the target URLs
target_index = 1
title_index = 1
filepath = "d:/url.txt"
# Target pattern is loop-invariant: compile once.
target_pattern = re.compile("http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]{5}")
for title_url in title_url_set:
    print("processing the %dth title url" % title_index)
    page = reptile.get_page(title_url, headers)
    parser.feed(str(page))
    # Save the target URLs
    with open(filepath, "a") as f:
        while len(parser.url_set) > 0:
            url = parser.url_set.pop()
            if re.findall(target_pattern, url):
                print("find target! saving the %dth target url in the %dth title page" % (target_index, title_index))
                f.write("the %dth url: %s" % (target_index, url))
                target_index = target_index + 1
                f.write("\n")
    title_index = title_index + 1
print("complete")
結果:
聲明:僅供學習研究使用,請勿用于其它非法用途
總結
以上是生活随笔為你收集整理的python网页爬虫例子_Python 利用Python编写简单网络爬虫实例3的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: python识别简单训练模型_Pytho
- 下一篇: python空格_python 空格