python: 爬取[博海拾贝]图片脚本
生活随笔
收集整理的這篇文章主要介紹了
python: 爬取[博海拾贝]图片脚本
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
練手代碼,聊作備忘:
# encoding: utf-8 # from __future__ import unicode_literalsimport urllib import urllib2 import re import os import time from threading import Threadclass BhsbSpider(object):_url = r'https://bh.sb/post/category/main/';_page_count = 0_page_index = 0def __init__(self, url, page_count = 0):self._url = urlself._page_count = page_countfolder = '博海拾貝'.decode('utf-8')if not os.path.exists(folder):os.mkdir(folder)def spider(self):while self._page_index < self._page_count:self._page_index += 1self._url = r'https://bh.sb/post/category/main/page/%d' % self._page_indexself.do_spider(self._url)def do_spider(self, url):html = self.get_html(url)pattern = r'(?s)<h2><a\s+href="(?P<url>[^"]+).*?>\[博海拾貝\d+\](?P<title>[^<]+).*?'for i, m in enumerate(re.findall(pattern, html)):info = '%d. url: %s, title: %s' % ((self._page_index - 1) * 20 + i + 1, m[0], m[1])print info# 多線程爬取頁面Thread(target=self.download, args=(m[0], m[1])).start()time.sleep(2)def download(self, url, title):title = '博海拾貝\\' + titletitle = title.decode('utf-8')if not os.path.exists(title):os.mkdir(title)html = self.get_html(url)pattern = r'(?s)<p>(?P<title>[^<]+).*?<p><img\s+src="(?P<image>[^"]+)"'for i, m in enumerate(re.findall(pattern, html)):img_title = m[0]img_url = m[1]img_filename = '%s/%s%s' % (title.encode('utf-8'), img_title, os.path.splitext(img_url)[1])img_filename = img_filename.decode('utf-8')print 'download %s ...' % img_filenameif not os.path.exists(img_filename):Thread(target=urllib.urlretrieve, args=(img_url, img_filename)).start()time.sleep(1)def get_html(self, url):try:url = url.encode('utf-8')req = urllib2.Request(url)req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.5.1000 Chrome/39.0.2146.0 Safari/537.36')page = urllib2.urlopen(req)return page.read()except Exception as ex:print 'get url_%s html error, ex=%s' % (url, ex)if __name__ == '__main__':url = r'https://bh.sb/post/category/main/'bs = BhsbSpider(url, 10)bs.spider()?
未及細測試,其間有圖片丟失情況。結果如下圖示:
轉載于:https://www.cnblogs.com/crwy/p/10623378.html
總結
以上是生活随笔為你收集整理的python: 爬取[博海拾贝]图片脚本的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: leetcode 796. 旋转字符串(
- 下一篇: GT sport赛道详解 - Drago