生活随笔
收集整理的這篇文章主要介紹了
自然基金项目爬虫测试(已失效)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
一年前寫的小爬蟲,用的自動化測試。還是可以自動登錄,但網站現在好像不向普通用戶提供查詢服務了。
寫了一次不容易,代碼還是保存在這里。
import os
import random
import re
import time as t

import xlrd
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
driver
= webdriver
.Firefox
()
driver
.maximize_window
()
driver
.get
('http://fund.sciencenet.cn/')def analyse_most(html
):"""進行網頁解析"""soup
= BeautifulSoup
(html
,'html.parser') titles
=[]authors
=[]inses
=[]sorts
=[]nums
=[]years
=[]mons
=[]keys
=[]title
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item p.t a')author
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item div.d p.ico span.author i')ins
=[]sort
=[]num
=[]year
=[]mon
=[]key
=[]for i
in range(1,11):ins_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(2) > i:nth-child(1)')sort_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > i:nth-child(3)')num_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > b:nth-child(4)')year_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(5) > b:nth-child(1)')mon_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(1) > b:nth-child(1)')key_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(2) > i:nth-child(1)')ins
.append
(ins_1
)sort
.append
(sort_1
)num
.append
(num_1
)year
.append
(year_1
)mon
.append
(mon_1
)key
.append
(key_1
)clear
(title
,titles
)clear
(author
,authors
)clear_l
(ins
,inses
)clear_l
(sort
,sorts
)clear_l
(num
,nums
)clear_l
(year
,years
)clear_l
(mon
,mons
)clear_l
(key
,keys
)page
=[]for i
in range(len(titles
)):page
.append
(titles
[i
:i
+1]+authors
[i
:i
+1]+inses
[i
:i
+1]+sorts
[i
:i
+1]+nums
[i
:i
+1]+years
[i
:i
+1]+mons
[i
:i
+1]+keys
[i
:i
+1])return page
def analyse_end(html
):"""進行最后一頁的網頁解析"""soup
= BeautifulSoup
(html
,'html.parser') titles
=[]authors
=[]inses
=[]sorts
=[]nums
=[]years
=[]mons
=[]keys
=[]title
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item p.t a')author
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item div.d p.ico span.author i')ins
=[]sort
=[]num
=[]year
=[]mon
=[]key
=[]for i
in range(1,3):ins_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(2) > i:nth-child(1)')sort_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > i:nth-child(3)')num_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > b:nth-child(4)')year_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(5) > b:nth-child(1)')mon_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(1) > b:nth-child(1)')key_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(2) > i:nth-child(1)')ins
.append
(ins_1
)sort
.append
(sort_1
)num
.append
(num_1
)year
.append
(year_1
)mon
.append
(mon_1
)key
.append
(key_1
)clear
(title
,titles
)clear
(author
,authors
)clear_l
(ins
,inses
)clear_l
(sort
,sorts
)clear_l
(num
,nums
)clear_l
(year
,years
)clear_l
(mon
,mons
)clear_l
(key
,keys
)page
=[]for i
in range(len(titles
)):page
.append
(titles
[i
:i
+1]+authors
[i
:i
+1]+inses
[i
:i
+1]+sorts
[i
:i
+1]+nums
[i
:i
+1]+years
[i
:i
+1]+mons
[i
:i
+1]+keys
[i
:i
+1])return page
def get_one():"""返回搜索第一頁源碼"""u_1
='15691604654'k_1
='9g8496suDFwqRfg'u_2
='15881650047'k_2
='zqlwx123'driver
.find_element_by_xpath
('/html/body/div[2]/div[1]/div/div[2]/a').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div/form/div/div[1]/input').send_keys
(u_1
)t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div/form/div/div[2]/input').send_keys
(k_1
)t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div/form/div/div[4]/button').click
()t
.sleep
(2)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[2]/div[2]/span[2]/span[1]/span/span[1]').click
()t
.sleep
(2)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[1]/div[1]/ul/li[8]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[1]/div[1]/div/ul/li[2]/label').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[1]/div[1]/div/div/ul/li[6]/label').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[2]/button').click
()t
.sleep
(3)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[1]').click
()driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[1]/option[24]').click
()t
.sleep
(3)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[2]')driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[2]/option[4]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[2]/button').click
()html
= driver
.page_source
return html
def get_one_mian():get_one
()driver
.find_element_by_xpath
('//*[@id="category"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="面上項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[2]/button').click
()html
=driver
.page_source
return html
def get_one_qing():get_one
()driver
.find_element_by_xpath
('//*[@id="category"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="青年科學基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[2]/button').click
()html
=driver
.page_source
return html
def get_one_others():get_one
()driver
.find_element_by_xpath
('//*[@id="category"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="地區科學基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="專項基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="國際(地區)合作與交流項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="重點項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="應急管理項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="優秀青年科學基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="海外及港澳學者合作研究基金"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="重大項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="國家杰出青年科學基金"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="重大研究計劃"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[2]/button').click
()html
=driver
.page_source
return html
def get_fore():"""前6頁的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[13]/a').click
()html
=driver
.page_source
return html
def get_fore_others():"""前6頁的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()html
=driver
.page_source
return html
def get_mid():"""中間的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[15]/a ').click
()html
=driver
.page_source
return html
def get_after():"""最后的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[14]/a ').click
()html
=driver
.page_source
return html
def clear(old_list
,new_list
):"""用于清洗出純文本"""for i
in old_list
:n
=(i
.text
).strip
()n
=n
.replace
('\n',' ')new_list
.append
(n
)return new_list
def clear_l(old_list
,new_list
):"""用于清洗出resultset的純文本"""for i
in old_list
:i
=i
[0]n
=(i
.text
).strip
()n
=n
.replace
('\n',' ')new_list
.append
(n
)return new_list
def save_afile(alls
,count
):os
.chdir
(r'F:\會計學基金項目數據代碼\雜項基金')"""將一頁的基金數據保存在一個excle"""f
=xlwt
.Workbook
()sheet1
=f
.add_sheet
(u'sheet1',cell_overwrite_ok
=True)sheet1
.write
(0,0,'項目名稱')sheet1
.write
(0,1,'負責人')sheet1
.write
(0,2,'申請單位')sheet1
.write
(0,3,'研究類型')sheet1
.write
(0,4,'項目批準號')sheet1
.write
(0,5,'批準年度')sheet1
.write
(0,6,'資助金額')sheet1
.write
(0,7,'關鍵詞')i
=1for data
in alls
:for j
in range(len(data
)):sheet1
.write
(i
,j
,data
[j
])i
=i
+1f
.save
(str(count
)+'.xls')print(str(count
)+'保存成功!')
def wait():"""返回隨機等待時間"""s
=t
.sleep
(random
.randint
(2,9))return s
def wait_l():s
=t
.sleep
(random
.randint
(1,5))return s
def goto_10_mian():get_one_mian
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[10]/a').click
()
def goto_13_qing():get_one_qing
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[10]/a').click
()
def goto_20_mian():get_one_mian
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()for i
in range(4):wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()
def goto_22_qing():get_one_qing
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()for i
in range(4):wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()
if __name__
== '__main__':for i
in range(1,10):if i
==1:save_afile
(analyse_most
(get_one_others
()),i
)elif 1<i
<=8:save_afile
(analyse_most
(get_fore_others
()),i
)else:save_afile
(analyse_end
(get_fore_others
()),i
)"""if i==1:save_afile(analyse_most(get_one_mian()),i)elif 1<i<=7:wait()save_afile(analyse_most(get_fore()),i)"""
總結
以上是生活随笔為你收集整理的自然基金项目爬虫测试(已失效)的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。