Important Components of Python's Scrapy Crawler Framework
I. The deduplication (dedup) rule component

By default, deduplication is done with a Python set(): the fingerprint of every request that has already been seen is kept in that in-memory set. (Only when you swap in scrapy-redis are those fingerprints stored in Redis instead, so that several crawler processes can share them.)

The class that implements this is RFPDupeFilter:

from scrapy.dupefilters import RFPDupeFilter   # spelled scrapy.dupefilter in older Scrapy versions

This class turns each request URL into a unique identifier (a fingerprint) via:

from scrapy.utils.request import request_fingerprint
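A minimal sketch (the URLs are made up) of what that fingerprint function does: it hashes the request method, the canonicalized URL and the body, so two URLs that differ only in query-parameter order map to the same fingerprint and are treated as duplicates.

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r1 = Request("http://example.com/?a=1&b=2")
r2 = Request("http://example.com/?b=2&a=1")   # same parameters, different order

# Both produce the same hex digest, so RFPDupeFilter would drop the second one.
print(request_fingerprint(r1))
print(request_fingerprint(r1) == request_fingerprint(r2))   # True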
Supplement: the dedup filter is consulted by the scheduler. Its enqueue_request() method (excerpt) decides whether a request is dropped or pushed onto the queue:

def enqueue_request(self, request):
    # dont_filter=True  -> "not request.dont_filter" is False -> the dedup check is skipped entirely
    # dont_filter=False -> "not request.dont_filter" is True  -> request_seen() runs, which both
    #                      checks the fingerprint and records it in the dedup rules
    if not request.dont_filter and self.df.request_seen(request):
        return False
    # otherwise the request is added to the scheduler's queue
    dqok = self._dqpush(request)
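A usage sketch (spider name and URLs are made up): setting dont_filter=True on a single Request is how you deliberately bypass that check, for example to re-fetch a page that was already crawled.

import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # Scheduled even if this URL was seen before, because dont_filter=True
        # makes enqueue_request() skip self.df.request_seen().
        yield scrapy.Request(
            url="http://example.com/",
            callback=self.parse_again,
            dont_filter=True,
        )

    def parse_again(self, response):
        self.logger.info("fetched again: %s", response.url)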
II. The scheduler

The scheduler decides the order in which queued requests are handed out:

1. Breadth-first: essentially a FIFO queue.
2. Depth-first: essentially a LIFO stack (Scrapy's default order).
3. Priority queue (e.g. a Redis sorted set in scrapy-redis).

A settings sketch for switching to breadth-first crawling follows below.
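A settings.py sketch, based on the queue classes Scrapy ships with, that turns the default depth-first crawl into a breadth-first one:

# settings.py
DEPTH_PRIORITY = 1                                             # deeper requests get lower priority
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'    # FIFO instead of the default LIFO
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'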
III. Downloader middleware

These middlewares sit between the engine (which takes requests from the scheduler) and the downloader.
a. What downloader middleware is for: a single place to preprocess every Request object in bulk before it is downloaded (and every Response on the way back).

b. User-Agent: the built-in UserAgentMiddleware runs by default and picks up the USER_AGENT value you configure in settings:

class UserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent  # USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)

c. Redirects: also handled by built-in, default-enabled middleware:

class BaseRedirectMiddleware(object):

    enabled_setting = 'REDIRECT_ENABLED'

    def __init__(self, settings):
        if not settings.getbool(self.enabled_setting):
            raise NotConfigured
        self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
        self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1
        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")

    def _redirect_request_using_get(self, request, redirect_url):
        redirected = request.replace(url=redirect_url, method='GET', body='')
        redirected.headers.pop('Content-Type', None)
        redirected.headers.pop('Content-Length', None)
        return redirected


class RedirectMiddleware(BaseRedirectMiddleware):
    """Handle redirection of requests based on response status
    and meta-refresh html tag."""

    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])
        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)

d. Cookies: the built-in CookiesMiddleware is likewise enabled by default.
Usage: in your own spider logic, yield requests with meta={"cookiejar": 1}. The key must be the lowercase "cookiejar", because that is what the middleware reads:

def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, callback=self.parse, meta={"cookiejar": 1})
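To keep that same session for follow-up requests, the pattern from the Scrapy docs is to pass the jar key along (the URL and callback name here are made up):

from scrapy import Request

def parse(self, response):
    # Reuse the jar that produced this response so login cookies are kept.
    yield Request(
        url="http://example.com/otherpage",
        callback=self.parse_other_page,
        meta={"cookiejar": response.meta["cookiejar"]},
    )

def parse_other_page(self, response):
    # Cookies collected earlier in this jar are sent automatically.
    pass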
The middleware itself:

class CookiesMiddleware(object):
    """This middleware enables working with sites that need cookies"""

    def __init__(self, debug=False):
        self.jars = defaultdict(CookieJar)
        self.debug = debug

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('COOKIES_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings.getbool('COOKIES_DEBUG'))

    def process_request(self, request, spider):
        if request.meta.get('dont_merge_cookies', False):
            return

        # cookiejarkey = 1
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]  # CookieJar object -> an empty container at first
        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request, spider)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_merge_cookies', False):
            return response

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response, spider)

        return response

    def _debug_cookie(self, request, spider):
        if self.debug:
            cl = [to_native_str(c, errors='replace')
                  for c in request.headers.getlist('Cookie')]
            if cl:
                cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
                msg = "Sending cookies to: {}\n{}".format(request, cookies)
                logger.debug(msg, extra={'spider': spider})

    def _debug_set_cookie(self, response, spider):
        if self.debug:
            cl = [to_native_str(c, errors='replace')
                  for c in response.headers.getlist('Set-Cookie')]
            if cl:
                cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
                msg = "Received cookies from: {}\n{}".format(response, cookies)
                logger.debug(msg, extra={'spider': spider})

    def _format_cookie(self, cookie):
        # build cookie string
        cookie_str = '%s=%s' % (cookie['name'], cookie['value'])

        if cookie.get('path', None):
            cookie_str += '; Path=%s' % cookie['path']

        if cookie.get('domain', None):
            cookie_str += '; Domain=%s' % cookie['domain']

        return cookie_str

    def _get_request_cookies(self, jar, request):
        if isinstance(request.cookies, dict):
            cookie_list = [{'name': k, 'value': v} for k, v in \
                           six.iteritems(request.cookies)]
        else:
            cookie_list = request.cookies

        cookies = [self._format_cookie(x) for x in cookie_list]
        headers = {'Set-Cookie': cookies}
        response = Response(request.url, headers=headers)

        return jar.make_cookies(response, request)

The default downloader middlewares and their order (the *_BASE setting):

DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}

Your project-level DOWNLOADER_MIDDLEWARES setting is merged on top of this; a sketch follows below.
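A settings.py sketch of that merge: you add your own class at a chosen priority and can disable a built-in one by mapping it to None. RandomUserAgentMiddleware is a hypothetical class name, reusing the xzx project name that appears in the spider-middleware example later on.

DOWNLOADER_MIDDLEWARES = {
    'xzx.middlewares.RandomUserAgentMiddleware': 543,                     # hypothetical custom middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,   # disable the built-in one
}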
Notes:

process_request does not have to return anything; returning None lets the request continue down the chain.
1. If it returns a Response, the download is skipped and that response is passed back up through the middlewares' process_response methods (starting from the last one, closest to the downloader).
2. If it returns a Request, the current request is dropped and the returned request goes back to the scheduler.
process_response must return something: a Response (or a Request, or it must raise IgnoreRequest).

A sketch of a custom downloader middleware that uses these rules follows below.
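A minimal sketch of such a middleware (the class name and User-Agent strings are made up; the commented-out branches only illustrate the return-value rules above):

import random

class DemoDownloaderMiddleware(object):

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    ]

    def process_request(self, request, spider):
        # Returning None: the request keeps flowing towards the downloader.
        request.headers.setdefault(b'User-Agent', random.choice(self.user_agents))
        return None

        # Returning a Response would skip the download and hand it straight
        # to the process_response chain, e.g.:
        #   return scrapy.http.HtmlResponse(url=request.url, body=b'from cache', encoding='utf-8')

        # Returning a Request would drop the current request and reschedule the new one:
        #   return request.replace(dont_filter=True)

    def process_response(self, request, response, spider):
        # Must return a Response (or a Request, or raise IgnoreRequest).
        return response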
IV. Spider middleware

Middleware that sits between the downloader/engine and the spider component: it processes what goes into the spider (responses) and what comes out of it (items and requests).

Among the defaults are middlewares for priority and depth handling (both provided by the built-in DepthMiddleware, configured through DEPTH_LIMIT and DEPTH_PRIORITY, see below).
Writing your own spider middleware:

class XzxSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

Register it in the settings file:

SPIDER_MIDDLEWARES = {
    'xzx.middlewares.XzxSpiderMiddleware': 543,
}

The built-in spider middlewares are configured through settings:

Depth limit:  DEPTH_LIMIT = 8
Priority:     DEPTH_PRIORITY = 1  -> request priorities go 0, -1, -2, -3, ... (deeper requests run later)
              DEPTH_PRIORITY = -1 -> request priorities go 0, 1, 2, 3, ...    (deeper requests run earlier)

(A simplified sketch of this adjustment follows after the list of defaults.)

SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}
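A simplified sketch (not the exact Scrapy source) of the rule DepthMiddleware applies with DEPTH_LIMIT / DEPTH_PRIORITY: every hop increments meta['depth'], requests past the limit are dropped, and the priority is shifted by depth * DEPTH_PRIORITY.

def adjust_depth(request, response, maxdepth, prio):
    """Return the request with depth/priority bookkeeping applied, or None to drop it."""
    depth = response.meta.get('depth', 0) + 1
    request.meta['depth'] = depth
    if prio:
        # DEPTH_PRIORITY = 1  -> priorities 0, -1, -2, ... (deeper runs later, breadth-first feel)
        # DEPTH_PRIORITY = -1 -> priorities 0, 1, 2, ...   (deeper runs earlier, depth-first feel)
        request.priority -= depth * prio
    if maxdepth and depth > maxdepth:
        return None   # DEPTH_LIMIT exceeded: the request is filtered out
    return request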
Summary:

1. DupeFilter
   - fingerprints of seen requests are kept in a set by default
   - each URL/request is turned into a unique fingerprint
   - why move the dedup rules into Redis? (so several crawler processes can share them, i.e. distributed crawling)
   - dedup works together with dont_filter
2. Scheduler
   - what do depth-first and breadth-first mean for a crawler?
   - what can they be implemented with?
     - a stack (depth-first)
     - a queue (breadth-first)
     - a priority queue / sorted set
3. The open-closed principle:
   Closed to the framework's source code, open to the configuration file: you get the behaviour you want by editing settings, not by patching Scrapy (a sketch follows below).
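For example, moving the dedup rules and the queue into Redis is purely a configuration change. A sketch that assumes the third-party scrapy-redis package is installed and a local Redis is running; the class paths follow that project's README:

# settings.py
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # fingerprints stored in Redis, shared by all workers
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # request queue backed by Redis
SCHEDULER_PERSIST = True                                     # keep fingerprints and queue between runs
REDIS_URL = "redis://127.0.0.1:6379"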
Reposted from: https://www.cnblogs.com/xuerh/p/9348849.html