scrapy爬个小网站
生活随笔
收集整理的這篇文章主要介紹了
scrapy爬个小网站
小編覺得挺不錯的，現在分享給大家，幫大家做個參考。
本文使用scrapy對某一個網站靜態數據進行了抓取
# -*- coding: utf-8 -*-
# Scrapy spider that crawls a paginated picture listing and downloads every
# image found on each detail page into a per-title folder under d:/dd.
# NOTE: this script targets Python 2 (reload/setdefaultencoding are Py2-only).
import scrapy
import requests
import os
import sys

# Py2-only hack so unicode titles can be joined into filesystem paths
# without UnicodeDecodeError; this call does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')


class spider(scrapy.Spider):
    """Crawl listing pages 1..399 and save the images of each linked post."""
    name = 'picSpider'
    allowed_domains = []

    # Page 1 has no "p_N" suffix; pages 2..399 do.
    urls = ['http://www.***.com/pic/12/']
    for i in range(2, 400):
        urls.append('http://www.***.com/pic/12/p_' + str(i) + '.html')
    start_urls = urls

    def parse(self, response):
        """Follow every post link on a listing page into parse_page."""
        links = response.xpath("//div[@class='box list channel']/ul/li/a/@href").extract()
        for href in links:
            # hrefs are site-relative; prepend the host to build absolute URLs.
            yield scrapy.Request('http://www.***.com' + href, callback=self.parse_page)

    def parse_page(self, response):
        """Download every image on a post page into a folder named after
        the post's <h1> title."""
        titles = response.xpath("//h1/text()").extract()
        if not titles:
            # No <h1> found: nothing to name the folder after, skip this page.
            return
        # NOTE(review): titles may contain characters that are invalid in
        # Windows filenames — confirm the site's titles are safe.
        path = os.path.join('d:/dd', titles[0])
        if not os.path.exists(path):
            os.mkdir(path)
        for src in response.xpath("//div[@class='post']/img/@src").extract():
            name = os.path.join(path, src.split('/')[-1])
            try:
                pic = requests.get(src, timeout=10)
            except requests.RequestException:
                # Best-effort download: skip images that time out or fail
                # instead of aborting the whole page.
                continue
            # `with` guarantees the file handle is closed even if write fails.
            with open(name, 'wb') as f:
                f.write(pic.content)
# -*- coding: utf-8 -*-
# Scrapy spider: walks a paginated gallery listing, follows each post link,
# and stores all of a post's images under d:/dd/<post title>/.
# Written for Python 2 (reload/setdefaultencoding do not exist on Python 3).
import scrapy
import requests
import os
import sys

# Python 2 workaround so unicode post titles can be used in file paths.
reload(sys)
sys.setdefaultencoding('utf-8')


class spider(scrapy.Spider):
    """Spider over listing pages 1-399 of the picture channel."""
    name = 'picSpider'
    allowed_domains = []

    # The first listing page has a bare URL; later pages carry a p_N suffix.
    urls = (['http://www.***.com/pic/12/'] +
            ['http://www.***.com/pic/12/p_' + str(n) + '.html'
             for n in range(2, 400)])
    start_urls = urls

    def parse(self, response):
        """Yield a request per post link found on a listing page."""
        for href in response.xpath("//div[@class='box list channel']/ul/li/a/@href").extract():
            # Links are site-relative, so prefix the host.
            yield scrapy.Request('http://www.***.com' + href,
                                 callback=self.parse_page)

    def parse_page(self, response):
        """Save each image of the post into a directory named by its title."""
        titles = response.xpath("//h1/text()").extract()
        if not titles:
            # Page without an <h1> cannot be mapped to a folder — skip it.
            return
        # NOTE(review): assumes the title is a valid directory name on
        # Windows — verify against real site titles.
        folder = os.path.join('d:/dd', titles[0])
        if not os.path.exists(folder):
            os.mkdir(folder)
        for src in response.xpath("//div[@class='post']/img/@src").extract():
            target = os.path.join(folder, src.split('/')[-1])
            try:
                reply = requests.get(src, timeout=10)
            except requests.RequestException:
                # One failed image must not abort the rest of the page.
                continue
            # Context manager closes the file even when the write raises.
            with open(target, 'wb') as out:
                out.write(reply.content)
轉載于:https://www.cnblogs.com/giserpan/p/6916093.html
總結
以上是生活随笔為你收集整理的scrapy爬个小网站的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 一些芯片资料
- 下一篇: Codeforces Round #14