當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

爬虫项目(三)---采集最近一日全国各省疫情数据

發布時間：2023/12/1 编程问答 30 豆豆

生活随笔收集整理的這篇文章主要介紹了爬虫项目(三)---采集最近一日全国各省疫情数据，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

該內容出自黑馬程序員教程

采集最近一日全國各省疫情數據

當然，數據來源仍然是丁香園新型冠狀病毒肺炎疫情實時動態首頁
url：https://ncov.dxy.cn/ncovh5/view/pneumonia

思路：首先需要先確定全國各省疫情數據的位置

全國各省份的疫情數據信息都在 id="getAreaStat" 的標籤中

步驟：

發送請求，獲取疫情首頁內容

解析疫情首頁內容，獲取最近一日各省疫情信息

以json格式保存疫情信息

import json
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # progress bar


class CoronaSpider:
    """Scrape epidemic data from the DXY COVID-19 live page and save it as JSON."""

    # Output files. Raw strings keep every backslash literal, so the values
    # are exactly the original Windows paths without relying on Python
    # tolerating invalid escape sequences such as "\J" or "\s".
    LAST_DAY_WORLD_PATH = r'E:\Jupyter_workspace\study\python\爬蟲\last_day_nature_num111.json'
    CORONA_VIRUS_PATH = r'E:\Jupyter_workspace\study\python\爬蟲\corona_virus.json'
    LAST_DAY_CHINA_PATH = r'E:\Jupyter_workspace\study\python\爬蟲\craw_last_day_corona_virus_of_china.json'

    def __init__(self):
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url, timeout=10):
        """Fetch ``url`` and return the response body as a string.

        Args:
            url: The URL to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl forever.

        Returns:
            The response body decoded with the default codec (UTF-8).

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx HTTP status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail here on an error status instead of later, when parsing an
        # error page would produce a confusing exception.
        response.raise_for_status()
        return response.content.decode()

    def parse_home_page(self, home_page):
        """Extract the latest-day per-country data from the home page.

        Args:
            home_page: HTML text of the epidemic home page.

        Returns:
            The Python data decoded from the JSON array embedded in the
            element whose id is ``getListByCountryTypeService2true``.

        Raises:
            ValueError: If that element is missing or its text contains no
                bracketed JSON array.
        """
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id='getListByCountryTypeService2true')
        if script is None:
            raise ValueError(
                "element with id 'getListByCountryTypeService2true' not found")
        # The square brackets are regex metacharacters, hence the escapes;
        # the match is the JSON array inside the element's text.
        match = re.search(r'\[.+\]', script.text)
        if match is None:
            raise ValueError(
                "no JSON array found in element 'getListByCountryTypeService2true'")
        return json.loads(match.group(0))

    def save(self, data, path):
        """Write ``data`` to ``path`` as JSON.

        ``ensure_ascii`` is left at its default, so non-ASCII text is stored
        as backslash-u escapes; pass ``ensure_ascii=False`` to ``json.dump``
        to store the characters themselves.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(data, fp)

    def crawl_last_day_corona_virus(self):
        """Collect the latest-day data for every country and save it."""
        # 1. Request the home page.
        home_page = self.get_content_from_url(self.home_url)
        # 2. Parse the latest-day per-country data out of it.
        last_data_corona_virus = self.parse_home_page(home_page)
        # 3. Save the data.
        self.save(last_data_corona_virus, self.LAST_DAY_WORLD_PATH)

    def crawl_corona_virus(self):
        """Collect every country's data since January 23 and save it.

        Reads the file written by ``crawl_last_day_corona_virus``, so that
        method must have been run first.
        """
        # 1. Load the latest-day per-country data.
        with open(self.LAST_DAY_WORLD_PATH, encoding='utf-8') as fp:
            last_day_corona_virus = json.load(fp)

        # Accumulates the daily records of all countries.
        corona_virus = []
        # 2. Each country record carries the URL of its historical data.
        for country in tqdm(last_day_corona_virus, '獲取從01月23號以來的世界各國疫情信息'):
            statistics_data_url = country['statisticsData']
            # 3. Download the country's historical data as a JSON string.
            statistics_data_json_str = self.get_content_from_url(statistics_data_url)
            # 4. Decode it and keep the list stored under 'data'.
            statistics_data = json.loads(statistics_data_json_str)['data']
            for one_day in statistics_data:
                # The daily records do not identify their country, so copy
                # the identifying fields over from the country record.
                one_day['provinceName'] = country['provinceName']
                one_day['countryShortCode'] = country['countryShortCode']
            corona_virus.extend(statistics_data)

        # 5. Save the combined list.
        self.save(corona_virus, self.CORONA_VIRUS_PATH)

    def craw_last_day_corona_virus_of_china(self):
        """Collect the latest-day data for every province of China and save it.

        Raises:
            ValueError: If the ``getAreaStat`` element is missing or its text
                contains no bracketed JSON array.
        """
        # 1. Request the home page.
        home_page = self.get_content_from_url(self.home_url)
        # 2. Parse the per-province data. These are the same steps as
        #    parse_home_page, applied to the element with id 'getAreaStat'.
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id='getAreaStat')
        if script is None:
            raise ValueError("element with id 'getAreaStat' not found")
        match = re.search(r'\[.+\]', script.text)
        if match is None:
            raise ValueError("no JSON array found in element 'getAreaStat'")
        data = json.loads(match.group(0))
        # 3. Save the data.
        self.save(data, self.LAST_DAY_CHINA_PATH)

    def run(self):
        """Run the selected crawl step(s)."""
        # Uncomment to run the world-wide steps. crawl_corona_virus reads the
        # file produced by crawl_last_day_corona_virus.
        # self.crawl_last_day_corona_virus()
        # self.crawl_corona_virus()
        self.craw_last_day_corona_virus_of_china()


if __name__ == '__main__':
    spider = CoronaSpider()
    spider.run()

很顯然，這里的craw_last_day_corona_virus_of_china和parse_home_page有的步驟是相同的，接下來開始代碼的重構進行優化

import json
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # progress bar


class CoronaSpider:
    """Scrape epidemic data from the DXY COVID-19 live page and save it as JSON."""

    # Output files. Raw strings keep every backslash literal, so the values
    # are exactly the original Windows paths without relying on Python
    # tolerating invalid escape sequences such as "\J" or "\s".
    LAST_DAY_WORLD_PATH = r'E:\Jupyter_workspace\study\python\爬蟲\last_day_nature_num111.json'
    CORONA_VIRUS_PATH = r'E:\Jupyter_workspace\study\python\爬蟲\corona_virus.json'
    LAST_DAY_CHINA_PATH = r'E:\Jupyter_workspace\study\python\爬蟲\craw_last_day_corona_virus_of_china.json'

    def __init__(self):
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url, timeout=10):
        """Fetch ``url`` and return the response body as a string.

        Args:
            url: The URL to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl forever.

        Returns:
            The response body decoded with the default codec (UTF-8).

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx HTTP status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail here on an error status instead of later, when parsing an
        # error page would produce a confusing exception.
        response.raise_for_status()
        return response.content.decode()

    def parse_home_page(self, home_page, tag_id):
        """Extract the JSON array embedded in one element of the home page.

        Args:
            home_page: HTML text of the epidemic home page.
            tag_id: The ``id`` attribute of the element holding the data,
                e.g. ``'getListByCountryTypeService2true'`` or
                ``'getAreaStat'``.

        Returns:
            The Python data decoded from the JSON array found in that
            element's text.

        Raises:
            ValueError: If the element is missing or its text contains no
                bracketed JSON array.
        """
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id=tag_id)
        if script is None:
            raise ValueError(f'element with id {tag_id!r} not found')
        # The square brackets are regex metacharacters, hence the escapes;
        # the match is the JSON array inside the element's text.
        match = re.search(r'\[.+\]', script.text)
        if match is None:
            raise ValueError(f'no JSON array found in element {tag_id!r}')
        return json.loads(match.group(0))

    def save(self, data, path):
        """Write ``data`` to ``path`` as JSON.

        ``ensure_ascii`` is left at its default, so non-ASCII text is stored
        as backslash-u escapes; pass ``ensure_ascii=False`` to ``json.dump``
        to store the characters themselves.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(data, fp)

    def crawl_last_day_corona_virus(self):
        """Collect the latest-day data for every country and save it."""
        # 1. Request the home page.
        home_page = self.get_content_from_url(self.home_url)
        # 2. Parse the latest-day per-country data out of it.
        last_data_corona_virus = self.parse_home_page(
            home_page, tag_id='getListByCountryTypeService2true')
        # 3. Save the data.
        self.save(last_data_corona_virus, self.LAST_DAY_WORLD_PATH)

    def crawl_corona_virus(self):
        """Collect every country's data since January 23 and save it.

        Reads the file written by ``crawl_last_day_corona_virus``, so that
        method must have been run first.
        """
        # 1. Load the latest-day per-country data.
        with open(self.LAST_DAY_WORLD_PATH, encoding='utf-8') as fp:
            last_day_corona_virus = json.load(fp)

        # Accumulates the daily records of all countries.
        corona_virus = []
        # 2. Each country record carries the URL of its historical data.
        for country in tqdm(last_day_corona_virus, '獲取從01月23號以來的世界各國疫情信息'):
            statistics_data_url = country['statisticsData']
            # 3. Download the country's historical data as a JSON string.
            statistics_data_json_str = self.get_content_from_url(statistics_data_url)
            # 4. Decode it and keep the list stored under 'data'.
            statistics_data = json.loads(statistics_data_json_str)['data']
            for one_day in statistics_data:
                # The daily records do not identify their country, so copy
                # the identifying fields over from the country record.
                one_day['provinceName'] = country['provinceName']
                one_day['countryShortCode'] = country['countryShortCode']
            corona_virus.extend(statistics_data)

        # 5. Save the combined list.
        self.save(corona_virus, self.CORONA_VIRUS_PATH)

    def craw_last_day_corona_virus_of_china(self):
        """Collect the latest-day data for every province of China and save it."""
        # 1. Request the home page.
        home_page = self.get_content_from_url(self.home_url)
        # 2. Parse the per-province data held in the 'getAreaStat' element.
        data = self.parse_home_page(home_page, tag_id='getAreaStat')
        # 3. Save the data.
        self.save(data, self.LAST_DAY_CHINA_PATH)

    def run(self):
        """Run the selected crawl step(s)."""
        # Uncomment to run the world-wide steps. crawl_corona_virus reads the
        # file produced by crawl_last_day_corona_virus.
        # self.crawl_last_day_corona_virus()
        # self.crawl_corona_virus()
        self.craw_last_day_corona_virus_of_china()


if __name__ == '__main__':
    spider = CoronaSpider()
    spider.run()

craw_last_day_corona_virus_of_china.json文件內容如下：

這里的編碼格式沒有改變（json.dump 默認 ensure_ascii=True，中文會以 \uXXXX 轉義形式寫入文件），故各個省份的漢字名稱沒有出現

https://file1.dxycdn.com/2020/0223/331/3398299755968040033-135.json 該json文件中存放著全國各個省的疫情數據信息。
爬蟲項目(四)中會用到該信息。

總結

以上是生活随笔為你收集整理的爬虫项目(三)---采集最近一日全国各省疫情数据的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：爬虫项目(二)---采集从03月02号以
下一篇：颐和园残疾证怎么预约