爬空气质量MySQL_爬虫:利用selenium采集某某环境网站的空气质量数据
前言:在上一篇文章中,我們介紹了在http://PM2.5.in這個網站采集空氣質量的數據,本篇文章是對其產生的一些問題的另一種解決方案,提供更加權威的數據采集。
技術框架:selenium、json、etree
這里的selenium是一種自動化測試的工具,它可以幫助我們模擬瀏覽器打開網頁并獲取網頁數據,本文之所以選擇這種方式進行,是因為以requests方式直接請求無法獲取到正確的數據,這個網頁的數據是動態加載,需要用戶執行點擊操作才會被請求
我們還是按照常規套路來分析下這個網站,打開F12,看下這個網站的數據請求
可以發現這個網站的數據請求接口,但當我們直接用requests去請求這個接口時,會發現無法獲取正確的數據。原因是這個網站采用了MmEwMD這個參數進行反爬蟲,這是一個比較常見的反爬蟲措施:這個值是在發起請求時動態生成的。解決這個問題最簡單的辦法就是采用selenium之類的模擬瀏覽器方法進行請求,這樣發出的請求就會自動帶上這個參數。
請求的代碼如下所示:
# Fetch the API page through a real Chrome instance so that the browser
# itself attaches the dynamically generated anti-crawler parameter.
driverPath = 'browser\\chromedriver.exe'
options = webdriver.ChromeOptions()
# Exclude the "enable-automation" switch and turn off the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# options.add_argument(('--proxy-server=http://' + ip))
# NOTE(review): `executable_path` is the Selenium 3 style keyword; newer
# Selenium releases expect a Service object instead - confirm the pinned version.
browser = webdriver.Chrome(options=options, executable_path=driverPath)
try:
    # Register a script that runs on every new document and makes
    # navigator.webdriver evaluate to undefined.
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
    })
    browser.get(self.url)
    html = browser.page_source
finally:
    # Always shut the browser down, even if the page load raised, so no
    # Chrome/chromedriver process is left behind.
    browser.quit()
# The JSON payload is read from the text node directly under <body>.
reponse = etree.HTML(html)
data = reponse.xpath('//body/text()')[0]
json_data = json.loads(data)
我們通過調用谷歌瀏覽器直接請求對應的頁面,獲取到數據后關閉瀏覽器,再通過etree解析網頁結果。通過觀察發現,我們獲取到的數據是json格式(每小時的數據位于data.hour數組中),因此我們使用json解析數據,然后將對應的數據存儲到數據庫:
# Convert every hourly record of the payload into a row dict and store the
# rows whose city is listed in city_config.
result_list = json_data['data']['hour']
print(result_list)
for result in result_list:
    # MONITORTIME is parsed with the "%Y年%m月%d日%H" format (hour resolution).
    live_data_time = datetime.datetime.strptime(result['MONITORTIME'], "%Y年%m月%d日%H")
    item = {
        'affect': result['AFFECTINFO'],
        'action': result['SUGGEST'],
        # The primary pollutant key may be absent from a record.
        'primary_pollutant': result.get('AQIPRIMPOLLUTE', '無'),
        'AQI': result['AQI'],
        'PM2.5/1h': result['PM25'],
        'PM10/1h': result['PM10'],
        'CO/1h': result['CO'],
        'NO2/1h': result['NO2'],
        'O3/1h': result['O3'],
        'O3/8h': result['O3_2'],
        'SO2/1h': result['SO2'],
        'city_name': result['POINTNAME'],
        'level': result['CODEAQILEVEL'] + '(' + result['AQILEVELNAME'] + ')',
        'live_data_time': live_data_time,
        'live_data_unit': 'μg/m3(CO為mg/m3)',
    }
    update_time = live_data_time.strftime('%Y-%m-%d %H:%M:%S')
    if item['city_name'] in city_config:
        self.save_mysql(item)
        success_count = success_count + 1
        log_text = '采集的城市:{},采集的結果:{}'.format(item['city_name'], '成功')
        self.save_log({'log_type': '0', 'log_text': log_text})
# NOTE(review): the original indentation was lost; the two statements below
# are assumed to run once after the loop - confirm against the author's code.
self.save_log({'log_type': '3', 'log_text': log_text})
self.update_spider_time(update_time)
# Store a run-log entry
def save_log(self, item):
    """Insert one row into the ``log`` table and commit it.

    Args:
        item: Mapping that provides ``'log_text'`` and ``'log_type'``.
            The ``created_time`` column is filled with the current local
            time (``datetime.datetime.now()``).
    """
    sql = 'INSERT INTO log(log_text,log_type,created_time) VALUES (%s,%s,%s)'
    values = [item['log_text'], item['log_type'], datetime.datetime.now()]
    self.cursor.execute(sql, values)
    self.conn.commit()
def save_mysql(self, item):
    """Insert one air-quality row unless the same city/time is already stored.

    Args:
        item: Mapping holding the row values; the keys read here are
            ``city_name``, ``level``, ``live_data_time``, ``live_data_unit``,
            ``AQI``, ``PM2.5/1h``, ``PM10/1h``, ``CO/1h``, ``NO2/1h``,
            ``O3/1h``, ``O3/8h``, ``SO2/1h``, ``affect``,
            ``primary_pollutant`` and ``action``.

    The count row is read by key (``data['count']``), so the cursor is
    expected to return rows as mappings.
    """
    # Look for an existing row with the same city and timestamp.
    query_sql = 'select count(1) as count from kongqizhiliang where city_name= %s and live_data_time = %s'
    self.cursor.execute(query_sql, [item['city_name'], item['live_data_time']])
    data = self.cursor.fetchone()
    if data['count'] != 0:
        # Already stored for this city at this time: nothing to insert.
        return
    sql = ("INSERT kongqizhiliang(city_name,level,live_data_time,live_data_unit,AQI,PM25_1h,PM10_1h,CO_1h"
           ",NO2_1h,O3_1h,O3_8h,SO2_1h,affect,primary_pollutant,action"
           ") VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    values = [
        item['city_name'], item['level'], item['live_data_time'],
        item['live_data_unit'], item['AQI'], item['PM2.5/1h'],
        item['PM10/1h'], item['CO/1h'], item['NO2/1h'], item['O3/1h'],
        item['O3/8h'], item['SO2/1h'], item['affect'],
        item['primary_pollutant'], item['action'],
    ]
    self.cursor.execute(sql, values)
    self.conn.commit()
其實當初這個反爬蟲措施也困擾了我一段時間的,我這里采用的是最簡單的方法解決,雖然效率不高,但能解決我的需求
完整代碼如下。其中部分代碼是可以不需要的,比如redis和config那部分,你們可以自己改一下,不會的可以問我。這個是當時給別人畢設做的,還有其他功能,所以會有一些其他的代碼。
"""
采集空氣質量的數據
目標網站:http://sthjt.hubei.gov.cn/hjsj/
"""
import requests
from lxml import etree
import re
from xpinyin import Pinyin
import pymysql
import sys
from settings.config import *
from utils import RedisUtil
import datetime
import json
from selenium import webdriver
class kongqizhiliang:
    """Collect hourly air-quality readings and store them in MySQL.

    The JSON served at ``url`` is loaded through a Selenium-driven Chrome
    instance, each hourly record is converted into a row dict, and rows whose
    city appears in the Redis-backed city list are inserted into the
    ``kongqizhiliang`` table. Progress is written to the ``log`` table.
    """

    # Request headers; not referenced by the methods of this class.
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    url = 'http://sthjt.hubei.gov.cn/wcmapi/service/aqi.xhtml'
    # Redis key holding the list of cities to collect.
    redis_key = 'kongqi:config_city'
    # Redis key under which the latest collected timestamp is stored.
    update_time = 'kongqi:update_time'
    # Chinese characters -> pinyin converter
    pinyin = Pinyin()

    # Script registered on every new document so that navigator.webdriver
    # evaluates to undefined.
    _HIDE_WEBDRIVER_JS = (
        "\n"
        "Object.defineProperty(navigator, 'webdriver', {\n"
        "get: () => undefined\n"
        "})\n"
    )

    def __init__(self):
        """Open the MySQL connection and a dict-returning cursor."""
        self.conn = pymysql.connect(host=host, port=port, user=user, passwd=passwd, db=db, charset=charset)
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    # Convert a city name into its code
    def get_code(self, city_name):
        """Return the pinyin of ``city_name`` joined with an empty separator."""
        return self.pinyin.get_pinyin(city_name, '')

    def get_city_config(self):
        """Return the list stored in Redis under ``redis_key``."""
        redis_util = RedisUtil.get_redis()
        return redis_util.list_get_range(self.redis_key)

    def update_spider_time(self, update_time):
        """Store ``update_time`` in Redis under the ``update_time`` key."""
        redis_util = RedisUtil.get_redis()
        redis_util.str_set(self.update_time, update_time)

    def get_data(self):
        """Fetch the hourly readings and store those of the configured cities.

        Writes a start log entry (type '2'), one entry per stored city
        (type '0') and a closing entry (type '3'), then records the timestamp
        of the last parsed record through ``update_spider_time``.
        """
        city_config = self.get_city_config()
        log_text = '采集開始,準備采集的城市:{},計劃采集的數據量:{}'.format(city_config,len(city_config))
        self.save_log({'log_type':'2','log_text':log_text})
        update_time = ''

        json_data = self._fetch_json()
        result_list = json_data['data']['hour']
        print(result_list)
        for result in result_list:
            item = self._build_item(result)
            update_time = item['live_data_time'].strftime('%Y-%m-%d %H:%M:%S')
            if item['city_name'] in city_config:
                self.save_mysql(item)
                log_text = '采集的城市:{},采集的結果:{}'.format(item['city_name'],'成功')
                self.save_log({'log_type':'0','log_text':log_text})
        # NOTE(review): the original indentation was lost; the closing log and
        # the timestamp update are assumed to run once after the loop - confirm.
        self.save_log({'log_type':'3','log_text':log_text})
        # Do not overwrite the stored timestamp with '' when no record was parsed.
        if update_time:
            self.update_spider_time(update_time)

    def _fetch_json(self):
        """Load ``url`` in Chrome and return the decoded JSON payload."""
        driverPath = 'browser\\chromedriver.exe'
        options = webdriver.ChromeOptions()
        # Exclude the "enable-automation" switch and turn off the automation extension.
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        # options.add_argument(('--proxy-server=http://' + ip))
        # NOTE(review): `executable_path` is the Selenium 3 style keyword; newer
        # Selenium releases expect a Service object - confirm the pinned version.
        browser = webdriver.Chrome(options=options, executable_path=driverPath)
        try:
            browser.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": self._HIDE_WEBDRIVER_JS},
            )
            browser.get(self.url)
            html = browser.page_source
        finally:
            # Always close the browser, even if the page load raised.
            browser.quit()
        # The JSON payload is read from the text node directly under <body>.
        reponse = etree.HTML(html)
        data = reponse.xpath('//body/text()')[0]
        return json.loads(data)

    @staticmethod
    def _build_item(result):
        """Map one hourly API record to the row dict used by ``save_mysql``."""
        return {
            'affect': result['AFFECTINFO'],
            'action': result['SUGGEST'],
            # The primary pollutant key may be absent from a record.
            'primary_pollutant': result.get('AQIPRIMPOLLUTE', '無'),
            'AQI': result['AQI'],
            'PM2.5/1h': result['PM25'],
            'PM10/1h': result['PM10'],
            'CO/1h': result['CO'],
            'NO2/1h': result['NO2'],
            'O3/1h': result['O3'],
            'O3/8h': result['O3_2'],
            'SO2/1h': result['SO2'],
            'city_name': result['POINTNAME'],
            'level': result['CODEAQILEVEL']+'('+result['AQILEVELNAME']+')',
            # MONITORTIME is parsed with hour resolution.
            'live_data_time': datetime.datetime.strptime(result['MONITORTIME'], "%Y年%m月%d日%H"),
            'live_data_unit': 'μg/m3(CO為mg/m3)',
        }

    # Store a run-log entry
    def save_log(self, item):
        """Insert ``item['log_text']`` / ``item['log_type']`` into ``log`` and commit."""
        sql = 'INSERT INTO log(log_text,log_type,created_time) VALUES (%s,%s,%s)'
        values = [item['log_text'],item['log_type'],datetime.datetime.now()]
        self.cursor.execute(sql,values)
        self.conn.commit()

    def save_mysql(self, item):
        """Insert ``item`` unless a row for the same city and time exists."""
        # Look for an existing row with the same city and timestamp.
        query_sql = 'select count(1) as count from kongqizhiliang where city_name= %s and live_data_time = %s'
        values = [item['city_name'],item['live_data_time']]
        self.cursor.execute(query_sql,values)
        data = self.cursor.fetchone()
        if data['count'] != 0:
            # Already stored for this city at this time: nothing to insert.
            return
        sql = ("INSERT kongqizhiliang(city_name,level,live_data_time,live_data_unit,AQI,PM25_1h,PM10_1h,CO_1h"
               ",NO2_1h,O3_1h,O3_8h,SO2_1h,affect,primary_pollutant,action"
               ") VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        values = [item['city_name'],item['level'],item['live_data_time'],item['live_data_unit'],item['AQI']
                  ,item['PM2.5/1h'],item['PM10/1h'],item['CO/1h'],item['NO2/1h'],item['O3/1h'],item['O3/8h']
                  ,item['SO2/1h'],item['affect'],item['primary_pollutant'],item['action']]
        self.cursor.execute(sql,values)
        self.conn.commit()
# Script entry point: build the collector and run one collection pass.
if __name__ == "__main__":
    app = kongqizhiliang()
    app.get_data()
本文首發于:爬蟲:利用selenium采集某某環境網站的空氣質量數據(www.bizhibihui.com)
總結
以上是生活随笔為你收集整理的爬空气质量MySQL_爬虫:利用selenium采集某某环境网站的空气质量数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 常见排序之——插入排序
- 下一篇: Spring Boot——@Config