scrapy-redis 分布式哔哩哔哩网站用户爬虫
生活随笔
收集整理的這篇文章主要介紹了
scrapy-redis 分布式哔哩哔哩网站用户爬虫
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
scrapy里面,對每次請求的url都有一個指紋,這個指紋就是判斷url是否被請求過的。默認是開啟指紋即一個URL請求一次。如果我們使用分布式在多臺機上面爬取數據,為了讓爬蟲的數據不重復,我們也需要一個指紋。但是scrapy默認的指紋是保存到本地的。所以我們可以使用redis來保存指紋,并且用redis里面的set集合來判斷是否重復。
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the bilibili project.
#
# Only commonly used settings are active here; for the full list see:
#   https://doc.scrapy.org/en/latest/topics/settings.html
#   https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bilibili'

SPIDER_MODULES = ['bilibili.spiders']
NEWSPIDER_MODULE = 'bilibili.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bilibili (+http://www.yourdomain.com)'

# Obey robots.txt rules (left disabled in the original project)
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# One-second delay between requests to the same site, to stay polite.
DOWNLOAD_DELAY = 1
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
#SPIDER_MIDDLEWARES = {
#    'bilibili.middlewares.BilibiliSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'bilibili.middlewares.BilibiliDownloaderMiddleware': 543,
    'bilibili.middlewares.randomUserAgentMiddleware': 400,
}

# Enable or disable extensions
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines.
# FIX: both pipelines were registered with the same order value (300),
# which makes their relative execution order undefined. The project
# pipeline now runs first (300), then the redis pipeline (400).
ITEM_PIPELINES = {
    'bilibili.pipelines.BilibiliPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# AutoThrottle extension (disabled by default)
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default)
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# scrapy-redis: shared scheduler and redis-backed request fingerprints so
# multiple crawler instances can cooperate without re-fetching the same URL.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
REDIS_URL = 'redis://@127.0.0.1:6379'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

# spider.py
# -*- coding: utf-8 -*-
import scrapy
import json
import re

from bilibili.items import BilibiliItem


class BilibiliappSpider(scrapy.Spider):
    """Crawl bilibili user profiles (uid 1..299): follower/following counts,
    uploaded-video channel tags, and basic account info, emitted as one
    BilibiliItem per user."""

    name = 'bilibiliapp'
    # allowed_domains = ['www.bilibili.com']
    # start_urls = ['http://www.bilibili.com/']

    # The relation API returns jsonp (`__jp3({...})`); this captures the
    # JSON object inside the callback wrapper. Compiled once for reuse.
    JSONP_RE = re.compile(r'({.*})')

    # API channel name -> BilibiliItem flag field.
    # NOTE: keys must match the API's tag names byte-for-byte.
    TAG_FIELDS = {
        '動畫': 'animation',
        '生活': 'Life',
        '音樂': 'Music',
        '游戲': 'Game',
        '舞蹈': 'Dance',
        '紀錄片': 'Documentary',
        '鬼畜': 'Ghost',
        '科技': 'science',
        '番劇': 'Opera',
        '娛樂': 'entertainment',
        '影視': 'Movies',
        '國創': 'National',
        '數碼': 'Digital',
        '時尚': 'fashion',
    }

    def start_requests(self):
        """Seed one relation-stat request per user id, with the user's space
        page as referer (the API rejects requests without it — TODO confirm)."""
        for uid in range(1, 300):
            url = 'https://api.bilibili.com/x/relation/stat?vmid={}&jsonp=jsonp&callback=__jp3'.format(uid)
            referer = 'https://space.bilibili.com/{}/'.format(uid)
            req = scrapy.Request(url=url, callback=self.parse, meta={'id': uid})
            req.headers['referer'] = referer
            yield req

    def parse(self, response):
        """Extract follower/following counts from the jsonp response, then
        chain to the user's submitted-videos endpoint."""
        payload = json.loads(self.JSONP_RE.findall(response.text)[0])
        follower = payload['data']['follower']
        following = payload['data']['following']
        uid = response.meta.get('id')
        url = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&page=1&pagesize=25'.format(uid)
        yield scrapy.Request(
            url=url,
            callback=self.getsubmit,
            meta={'id': uid, 'follower': follower, 'following': following},
        )

    def getsubmit(self, response):
        """Collect the channel names of the user's uploads, then chain to the
        account-info endpoint."""
        data = json.loads(response.text)
        tlist = data['data']['tlist']
        if tlist:
            # tlist is a dict keyed by channel id; each value carries 'name'.
            tlist_list = [entry['name'] for entry in tlist.values()]
        else:
            # No uploads: use the placeholder tag (sets no flag fields below).
            tlist_list = ['無愛好']
        uid = response.meta.get('id')
        url = 'https://api.bilibili.com/x/space/acc/info?mid={}&jsonp=jsonp'.format(uid)
        yield scrapy.Request(
            url=url,
            callback=self.space,
            meta={
                'id': uid,
                'follower': response.meta.get('follower'),
                'following': response.meta.get('following'),
                'tlist_list': tlist_list,
            },
        )

    def space(self, response):
        """Combine account info with the data accumulated in meta and yield
        the finished item. Each channel flag is 1 if the user uploaded to
        that channel, else 0."""
        profile = json.loads(response.text)['data']

        # One 0/1 flag per known channel; unknown tags are ignored, which
        # matches the original if/elif chain.
        flags = {field: 0 for field in self.TAG_FIELDS.values()}
        for tag in response.meta.get('tlist_list'):
            field = self.TAG_FIELDS.get(tag)
            if field is not None:
                flags[field] = 1

        item = BilibiliItem()
        item['name'] = profile['name']
        item['sex'] = profile['sex']
        item['level'] = profile['level']
        item['birthday'] = profile['birthday']
        item['follower'] = response.meta.get('follower')
        item['following'] = response.meta.get('following')
        for field, value in flags.items():
            item[field] = value
        yield item

# middlewares.py — random User-Agent pool
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random


class randomUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random desktop User-Agent header on
    every outgoing request (unless one is already present)."""

    # Pool of desktop browser User-Agent strings.
    # FIX: the original list was missing a comma after the first entry, so
    # Python's implicit string concatenation fused the first two UAs into one
    # invalid header value and silently shrank the pool by one.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        # Mirrors the base-class attribute; the pool above is what
        # process_request actually draws from.
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Attach a randomly chosen UA; setdefault keeps any UA already set."""
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

# Repo: https://github.com/18370652038/scrapy-bilibili
轉載于:https://www.cnblogs.com/dayouzi/p/10889789.html
總結
以上是生活随笔為你收集整理的scrapy-redis 分布式哔哩哔哩网站用户爬虫的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: [RN] React Native 实现
- 下一篇: 软件工程综合实践 作业 Axure介