Python爬虫实践:获取石家庄空气质量历史数据(13年至今)
生活随笔
收集整理的這篇文章主要介紹了
Python爬虫实践:获取石家庄空气质量历史数据(13年至今)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import numpy
import csv
import time


def _fetch_table_rows(url):
    """Download *url* and return every ``<tr>`` element found in the page.

    The page bytes are decoded as GB18030 and parsed with the lxml parser.
    """
    # BeautifulSoup reads the whole response inside the ``with`` block, so the
    # HTTP connection can be closed as soon as parsing is done.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, "lxml", from_encoding="gb18030")
    return soup.find_all("tr")


def _row_cells(row):
    """Return the text of one ``<tr>`` element as a list of cell strings."""
    text = row.get_text().replace(" ", "").replace("\r\n", "")
    return text.strip("\n").split("\n")


def getdatawithtablehead(url):
    """Fetch one monthly page and return its table including the header row.

    Args:
        url: Address of a monthly air-quality page.

    Returns:
        A 2-D numpy array of strings: row 0 is the table header, the
        remaining rows are the data rows of that month.

    Raises:
        IndexError: If the page contains no ``<tr>`` element at all.
    """
    rows = _fetch_table_rows(url)
    # The header row is split on blank lines, the data rows on single
    # newlines (see _row_cells).
    header = rows[0].get_text().strip("\n").split("\n\n")
    body = [_row_cells(row) for row in rows[1:]]
    if not body:
        # vstack cannot combine the header with an empty list of rows.
        return numpy.array([header])
    return numpy.vstack(([header], body))


def getdata(url):
    """Fetch one monthly page and return its data rows without the header.

    Args:
        url: Address of a monthly air-quality page.

    Returns:
        A list of rows, each row being a list of cell strings. The first
        ``<tr>`` of the page (the header) is skipped.
    """
    return [_row_cells(row) for row in _fetch_table_rows(url)[1:]]


# time.clock() was removed in Python 3.8; perf_counter() is its replacement
# for measuring elapsed wall-clock time.
start = time.perf_counter()

# Shijiazhuang air quality index (AQI) / PM2.5 index page.
starturl = "http://www.tianqihoubao.com/aqi/shijiazhuang.html"
with urlopen(starturl) as html:
    bsobj = BeautifulSoup(html, "lxml")

# Collect the links of all monthly pages found on the index page.
Sites = [
    "http://www.tianqihoubao.com" + link.attrs['href']
    for link in bsobj.find_all(href=re.compile("^(/aqi/shijiazhuang-)"))
]
# Reverse the list so that the pages are processed in the opposite order of
# their appearance on the index page.
Sites.reverse()
print(Sites)
if not Sites:
    raise RuntimeError("no monthly page links found at %s" % starturl)

# The first page supplies the header plus its data; every further page only
# contributes data rows.
monthly_tables = [getdatawithtablehead(Sites[0])]
for url in Sites[1:]:
    dataset = getdata(url)
    if dataset:  # an empty month cannot be stacked and adds nothing anyway
        monthly_tables.append(dataset)
# Stack once at the end instead of re-copying the growing array every month.
Dataset = numpy.vstack(monthly_tables)

# newline="" is required by the csv module (otherwise blank lines appear
# between rows on Windows). The encoding is fixed so that the Chinese header
# text is written identically on every platform; the BOM written by
# "utf-8-sig" lets spreadsheet programs detect UTF-8.
with open("shijiazhuang.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    csv.writer(csvfile).writerows(Dataset.tolist())

end = time.perf_counter()
print('Running time: %s Seconds' % (end - start))
原文地址:
https://blog.csdn.net/qq_36185831/article/details/79123144
https://blog.csdn.net/u013337691/article/details/51894453#commentsedit
總結
以上是生活随笔為你收集整理的Python爬虫实践:获取石家庄空气质量历史数据(13年至今)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 笔记本英雄联盟界面服务器停止运行,提示请
- 下一篇: android手机自动打开pdf文件夹,