生活随笔
收集整理的這篇文章主要介紹了
自然基金项目爬虫测试(已失效)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
一年前寫的小爬蟲,用的自動化測試。還是可以自動登錄,但網站現在好像不向普通用戶提供查詢服務了。
寫了一次不容易,代碼還是保存在這里。
import os
import random
import re
import time as t

import xlrd
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
driver
= webdriver
.Firefox
()
driver
.maximize_window
()
driver
.get
('http://fund.sciencenet.cn/')def analyse_most(html
):"""進行網頁解析"""soup
= BeautifulSoup
(html
,'html.parser') titles
=[]authors
=[]inses
=[]sorts
=[]nums
=[]years
=[]mons
=[]keys
=[]title
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item p.t a')author
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item div.d p.ico span.author i')ins
=[]sort
=[]num
=[]year
=[]mon
=[]key
=[]for i
in range(1,11):ins_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(2) > i:nth-child(1)')sort_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > i:nth-child(3)')num_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > b:nth-child(4)')year_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(5) > b:nth-child(1)')mon_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(1) > b:nth-child(1)')key_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(2) > i:nth-child(1)')ins
.append
(ins_1
)sort
.append
(sort_1
)num
.append
(num_1
)year
.append
(year_1
)mon
.append
(mon_1
)key
.append
(key_1
)clear
(title
,titles
)clear
(author
,authors
)clear_l
(ins
,inses
)clear_l
(sort
,sorts
)clear_l
(num
,nums
)clear_l
(year
,years
)clear_l
(mon
,mons
)clear_l
(key
,keys
)page
=[]for i
in range(len(titles
)):page
.append
(titles
[i
:i
+1]+authors
[i
:i
+1]+inses
[i
:i
+1]+sorts
[i
:i
+1]+nums
[i
:i
+1]+years
[i
:i
+1]+mons
[i
:i
+1]+keys
[i
:i
+1])return page
def analyse_end(html
):"""進行最后一頁的網頁解析"""soup
= BeautifulSoup
(html
,'html.parser') titles
=[]authors
=[]inses
=[]sorts
=[]nums
=[]years
=[]mons
=[]keys
=[]title
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item p.t a')author
=soup
.select
('html body div.hom div#wrapper_body div#main.wp div#mainCt div#resultLst.resultLst div.item div.d p.ico span.author i')ins
=[]sort
=[]num
=[]year
=[]mon
=[]key
=[]for i
in range(1,3):ins_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(2) > i:nth-child(1)')sort_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > i:nth-child(3)')num_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > b:nth-child(4)')year_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(1) > span:nth-child(5) > b:nth-child(1)')mon_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(1) > b:nth-child(1)')key_1
=soup
.select
('div.item:nth-child('+str(i
)+') > div:nth-child(2) > p:nth-child(2) > span:nth-child(2) > i:nth-child(1)')ins
.append
(ins_1
)sort
.append
(sort_1
)num
.append
(num_1
)year
.append
(year_1
)mon
.append
(mon_1
)key
.append
(key_1
)clear
(title
,titles
)clear
(author
,authors
)clear_l
(ins
,inses
)clear_l
(sort
,sorts
)clear_l
(num
,nums
)clear_l
(year
,years
)clear_l
(mon
,mons
)clear_l
(key
,keys
)page
=[]for i
in range(len(titles
)):page
.append
(titles
[i
:i
+1]+authors
[i
:i
+1]+inses
[i
:i
+1]+sorts
[i
:i
+1]+nums
[i
:i
+1]+years
[i
:i
+1]+mons
[i
:i
+1]+keys
[i
:i
+1])return page
def get_one():"""返回搜索第一頁源碼"""u_1
='15691604654'k_1
='9g8496suDFwqRfg'u_2
='15881650047'k_2
='zqlwx123'driver
.find_element_by_xpath
('/html/body/div[2]/div[1]/div/div[2]/a').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div/form/div/div[1]/input').send_keys
(u_1
)t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div/form/div/div[2]/input').send_keys
(k_1
)t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div/form/div/div[4]/button').click
()t
.sleep
(2)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[2]/div[2]/span[2]/span[1]/span/span[1]').click
()t
.sleep
(2)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[1]/div[1]/ul/li[8]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[1]/div[1]/div/ul/li[2]/label').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[1]/div[1]/div/div/ul/li[6]/label').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/dialog/bd/div[2]/button').click
()t
.sleep
(3)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[1]').click
()driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[1]/option[24]').click
()t
.sleep
(3)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[2]')driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[1]/div[1]/div[3]/select[2]/option[4]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[2]/div[4]/div/div[1]/div/div[2]/div[1]/form/div[2]/button').click
()html
= driver
.page_source
return html
def get_one_mian():get_one
()driver
.find_element_by_xpath
('//*[@id="category"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="面上項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[2]/button').click
()html
=driver
.page_source
return html
def get_one_qing():get_one
()driver
.find_element_by_xpath
('//*[@id="category"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="青年科學基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[2]/button').click
()html
=driver
.page_source
return html
def get_one_others():get_one
()driver
.find_element_by_xpath
('//*[@id="category"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="地區科學基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="專項基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="國際(地區)合作與交流項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="重點項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="應急管理項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="優秀青年科學基金項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="海外及港澳學者合作研究基金"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="重大項目"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="國家杰出青年科學基金"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('//*[@id="重大研究計劃"]').click
()t
.sleep
(1)driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[2]/button').click
()html
=driver
.page_source
return html
def get_fore():"""前6頁的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[13]/a').click
()html
=driver
.page_source
return html
def get_fore_others():"""前6頁的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()html
=driver
.page_source
return html
def get_mid():"""中間的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[15]/a ').click
()html
=driver
.page_source
return html
def get_after():"""最后的翻頁,并返回源碼"""driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[14]/a ').click
()html
=driver
.page_source
return html
def clear(old_list
,new_list
):"""用于清洗出純文本"""for i
in old_list
:n
=(i
.text
).strip
()n
=n
.replace
('\n',' ')new_list
.append
(n
)return new_list
def clear_l(old_list
,new_list
):"""用于清洗出resultset的純文本"""for i
in old_list
:i
=i
[0]n
=(i
.text
).strip
()n
=n
.replace
('\n',' ')new_list
.append
(n
)return new_list
def save_afile(alls
,count
):os
.chdir
(r'F:\會計學基金項目數據代碼\雜項基金')"""將一頁的基金數據保存在一個excle"""f
=xlwt
.Workbook
()sheet1
=f
.add_sheet
(u'sheet1',cell_overwrite_ok
=True)sheet1
.write
(0,0,'項目名稱')sheet1
.write
(0,1,'負責人')sheet1
.write
(0,2,'申請單位')sheet1
.write
(0,3,'研究類型')sheet1
.write
(0,4,'項目批準號')sheet1
.write
(0,5,'批準年度')sheet1
.write
(0,6,'資助金額')sheet1
.write
(0,7,'關鍵詞')i
=1for data
in alls
:for j
in range(len(data
)):sheet1
.write
(i
,j
,data
[j
])i
=i
+1f
.save
(str(count
)+'.xls')print(str(count
)+'保存成功!')
def wait():"""返回隨機等待時間"""s
=t
.sleep
(random
.randint
(2,9))return s
def wait_l():s
=t
.sleep
(random
.randint
(1,5))return s
def goto_10_mian():get_one_mian
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[10]/a').click
()
def goto_13_qing():get_one_qing
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[10]/a').click
()
def goto_20_mian():get_one_mian
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()for i
in range(4):wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()
def goto_22_qing():get_one_qing
()wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[9]/a').click
()for i
in range(4):wait_l
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()driver
.find_element_by_xpath
('/html/body/div[3]/div[4]/div/div[3]/div[2]/p/span[11]/a').click
()
if __name__
== '__main__':for i
in range(1,10):if i
==1:save_afile
(analyse_most
(get_one_others
()),i
)elif 1<i
<=8:save_afile
(analyse_most
(get_fore_others
()),i
)else:save_afile
(analyse_end
(get_fore_others
()),i
)"""if i==1:save_afile(analyse_most(get_one_mian()),i)elif 1<i<=7:wait()save_afile(analyse_most(get_fore()),i)"""
總結
以上是生活随笔為你收集整理的自然基金项目爬虫测试(已失效)的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。