21 Days to Build a Distributed Crawler (7): Scraping Qiushibaike with the Spider Class
7.1. Qiushibaike
Installation
pip install pypiwin32
pip install Twisted-18.7.0-cp36-cp36m-win_amd64.whl
pip install scrapy
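The Twisted wheel is installed from a local prebuilt package (cp36-win_amd64 = CPython 3.6 on 64-bit Windows), since pip could not always build Twisted from source on Windows at the time. To confirm the toolchain installed correctly, a quick check:

scrapy version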
Creating and running the project
scrapy startproject qsbk                        # create the project
scrapy genspider qsbk_spider "qiushibaike.com"  # create the spider
scrapy crawl qsbk_spider                        # run the spider
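After these three commands, the generated project follows Scrapy's standard template layout (a sketch of the expected tree):

qsbk/
    scrapy.cfg
    qsbk/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            qsbk_spider.py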
Code
qsbk_spider.py
# -*- coding: utf-8 -*-
import scrapy
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/8hr/page/1/']
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        duanzidivs = response.xpath("//div[@id='content-left']/div")
        for duanzidiv in duanzidivs:
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item
        # Crawl the following pages
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
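The two XPath expressions can be verified against a live page with scrapy shell before running the full crawl (an illustrative session; the site's markup may have changed since this tutorial was written):

scrapy shell "https://www.qiushibaike.com/8hr/page/1/"
>>> divs = response.xpath("//div[@id='content-left']/div")
>>> len(divs)                            # number of posts on the page
>>> divs[0].xpath(".//h2/text()").get()  # author of the first post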
items.py
import scrapy


class QsbkItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
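A scrapy.Item behaves like a dict restricted to its declared fields, which is why the pipelines below can serialize it with dict(item). A quick illustration:

item = QsbkItem(author='someone', content='a joke')
item['author']      # 'someone'
dict(item)          # {'author': 'someone', 'content': 'a joke'}
item['votes'] = 10  # raises KeyError: QsbkItem does not support field: votes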
pipelines.py
# -*- coding: utf-8 -*-
import json

# 1. Manually convert the dict to JSON format
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'w', encoding='utf-8')
#
#     def open_spider(self, spider):
#         print('Spider started')
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print('Spider finished')

# 2. Use JsonItemExporter, suitable when the amount of data is small
# from scrapy.exporters import JsonItemExporter
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         print('Spider started')
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
#         print('Spider finished')

# 3. JsonLinesItemExporter, suitable when the amount of data is large
from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished')
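Note that JsonLinesItemExporter writes one JSON object per line rather than a single JSON array, so duanzi.json must be read back line by line (a minimal sketch):

import json

with open('duanzi.json', encoding='utf-8') as fp:
    duanzi = [json.loads(line) for line in fp]
print(len(duanzi), duanzi[0]['author'])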
settings.py
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
}
start.py
# Equivalent to running "scrapy crawl qsbk_spider" in a terminal
from scrapy import cmdline

cmdline.execute("scrapy crawl qsbk_spider".split())
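start.py simply replays the crawl command so the spider can be launched from an IDE. If you prefer not to go through cmdline, Scrapy's CrawlerProcess API does the same thing programmatically (a sketch; it loads the same settings.py):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from qsbk.spiders.qsbk_spider import QsbkSpiderSpider

process = CrawlerProcess(get_project_settings())
process.crawl(QsbkSpiderSpider)
process.start()  # blocks until the crawl finishes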