import bs4
import pandas as pd
import requests


def fetchUrl(url):
    '''
    Function: send an HTTP GET request to the given url and try to fetch the page.
    Parameter: url: the URL of the target webpage.
    Returns: the decoded HTML text of the response, or None on failure.
    '''
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        print('success!')
        return r.text
    except requests.RequestException as e:
        print(e)
        return None


def parserHtml(html, urating):
    '''
    Function: parse the in-memory HTML text and extract the ranking table.
    Parameters: html: the HTML text of the fetched page.
                urating: a 2-D list that collects the university ranking rows.
    Returns: the 2-D list of ranking rows, header row first.
    '''
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    # Header row: the plain <th> cells, then the <option>s of the <select>
    # that names the indicator columns; the <th> wrapping the <select> has
    # no string of its own, so it is popped before the options are appended.
    tr = bsobj.find('thead').find('tr')
    hlist = []
    if isinstance(tr, bs4.element.Tag):
        for th in tr('th'):
            hlist.append(th.string)
        hlist.pop()
        for option in tr('option'):
            hlist.append(option.string)
        urating.append(hlist)
    # Body rows: one list of <td> strings per university.
    for tr in bsobj.find('tbody').children:
        blist = []
        if isinstance(tr, bs4.element.Tag):
            for td in tr('td'):
                blist.append(td.string)
            urating.append(blist)
    return urating


def output(urating, filename):
    '''
    Function: write the results to a CSV file.
    Parameters: urating: the 2-D list holding the ranking results.
                filename: the name of the file to save to.
    Returns: None.
    '''
    dataframe = pd.DataFrame(urating)
    dataframe.to_csv(filename, index=False, sep=',', header=False)
    print("Success!")


def main():
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    print("Begin to crawl " + url + " and get the ranking of universities in China ...")
    print('---' * 20)
    print("Try to fetch url ...")
    html = fetchUrl(url)
    if html is None:
        return
    print("Try to parse html ...")
    urating = []
    ur = parserHtml(html, urating)
    print("Try to save the results in a file ...")
    output(ur, '大學排名2018.csv')
    print("The work of crawling is done.")


if __name__ == '__main__':
    main()
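The DOM walk in parserHtml implies a specific table layout: a <thead> row whose last <th> wraps a <select> of indicator names, and <tbody> rows of plain <td> cells. The snippet below is a hand-written stand-in for that structure (not a copy of the real page's markup); run in the same module as the script above, it shows the shape of the returned list:

# A minimal, hand-made stand-in for the table structure that parserHtml
# assumes; the real page's markup may differ in detail.
sample_html = '''
<table>
  <thead>
    <tr>
      <th>排名</th><th>學校名稱</th><th>省市</th><th>總分</th>
      <th><select><option>生源質量</option><option>培養結果</option></select></th>
    </tr>
  </thead>
  <tbody>
    <tr><td>1</td><td>清華大學</td><td>北京</td><td>95.3</td><td>100.0</td></tr>
  </tbody>
</table>
'''

rows = parserHtml(sample_html, [])
print(rows[0])  # ['排名', '學校名稱', '省市', '總分', '生源質量', '培養結果']
print(rows[1])  # ['1', '清華大學', '北京', '95.3', '100.0']

A run of the full script against the live page prints console output like: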
Begin to crawl http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html and get the ranking of universities in China ...
------------------------------------------------------------
Try to fetch url ...
success!
Try to parse html ...
Try to save the results in a file ...
Success!
The work of crawling is done.
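Since output() writes the CSV without a header row, a quick sanity check (a sketch, assuming the same filename used in main) is to read the file back with pandas:

import pandas as pd

# header=None because output() wrote no header row; the ranking header
# is therefore just the first data row of the file.
df = pd.read_csv('大學排名2018.csv', header=None)
print(df.head())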