Python爬虫小实践:寻找失踪人口,爬取失踪儿童信息并写成csv文件,方便存入数据库...
前兩天有人私信我,讓我爬這個網站 http://bbs.baobeihuijia.com/forum-191-1.html 上的失蹤兒童信息,準備根據失蹤兒童失蹤時的地理位置來更好地尋找失蹤兒童。這種事情本就應該義不容辭;如果對網站服務器造成負荷,還請諒解。
這次依然是用第三方爬蟲包BeautifulSoup,還有Selenium+Chrome,Selenium+PhantomJS來爬取信息。
通過分析網站的框架,依然分三步來進行。
步驟一:獲取 http://bbs.baobeihuijia.com/forum-191-1.html 這個版塊上的所有分頁頁面鏈接
步驟二:獲取每一個分頁鏈接上所發的帖子的鏈接
步驟三:獲取每一個帖子鏈接上要爬取的信息,編號,姓名,性別,出生日期,失蹤時身高,失蹤時間,失蹤地點,以及是否報案
起先用的BeautifulSoup,但是被管理員設置了網站重定向,然后就采用selenium的方式,在這里還是對網站管理員說一聲抱歉。
1、獲取 http://bbs.baobeihuijia.com/forum-191-1.html 這個版塊上的所有分頁頁面鏈接
通過分析:發現分頁的頁面鏈接處于<div class="pg">下,所以寫了以下的代碼
BeautifulSoup形式:
2.獲取每一個分頁鏈接上所發的帖子的鏈接
每個帖子的鏈接都位于href下
所以寫了以下的代碼:
BeautifulSoup形式:
3.獲取每一個帖子鏈接上要爬取的信息,編號,姓名,性別,出生日期,失蹤時身高,失蹤時間,失蹤地點,以及是否報案,并寫入CSV中
通過查看每一個帖子的鏈接,發現其失蹤人口信息都在<ul>標簽下,所以編寫了以下的代碼
BeautifulSoup形式:
?
Selenium形式:
[python]?view plain?copy 1.#得到當前頁面失蹤人口信息?? 2.#pageUrl為當前帖子頁面鏈接?? 3.def?CurrentPageMissingPopulationInformation(tieziUrl):?? 4.????#設置代理IP訪問?? 5.????#代理IP可以上http://http.zhimaruanjian.com/獲取?? 6.????proxy_handler=urllib.request.ProxyHandler({'post':'128.199.169.17:80'})?? 7.????proxy_auth_handler=urllib.request.ProxyBasicAuthHandler()?? 8.????opener?=?urllib.request.build_opener(urllib.request.HTTPHandler,?proxy_handler)?? 9.????urllib.request.install_opener(opener)?? 10.?? 11.????try:?? 12.????????#掉用第三方包selenium打開瀏覽器登陸?? 13.????????#driver=webdriver.Chrome()#打開chrome?? 14.???????driver=webdriver.Chrome()#打開無界面瀏覽器Chrome?? 15.???????#driver=webdriver.PhantomJS()#打開無界面瀏覽器PhantomJS?? 16.???????driver.set_page_load_timeout(10)?? 17.???????#driver.implicitly_wait(30)?? 18.???????try:?? 19.???????????driver.get(tieziUrl)#登陸兩次?? 20.???????????driver.get(tieziUrl)?? 21.???????except?TimeoutError:?? 22.???????????driver.refresh()?? 23.?? 24.???????#print(driver.page_source)?? 25.???????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 26.????????#獲取網頁信息?? 27.????#抓捕網頁解析過程中的錯誤?? 28.???????try:?? 29.???????????#req=request.Request(tieziUrl,headers=headers5)?? 30.???????????#html=urlopen(req)?? 31.???????????bsObj=BeautifulSoup(html,"html.parser")?? 32.???????????#html.close()?? 33.???????except?UnicodeDecodeError?as?e:?? 34.???????????print("-----UnicodeDecodeError?url",tieziUrl)?? 35.???????except?urllib.error.URLError?as?e:?? 36.???????????print("-----urlError?url:",tieziUrl)?? 37.???????except?socket.timeout?as?e:?? 38.???????????print("-----socket?timout:",tieziUrl)?? 39.?? 40.?? 41.???????while(bsObj.find('title').get_text()?==?"頁面重載開啟"):?? 42.???????????print("當前頁面不是重加載后的頁面,程序會嘗試刷新一次到跳轉后的頁面\n")?? 43.???????????driver.get(tieziUrl)?? 44.???????????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 45.???????????bsObj=BeautifulSoup(html,"html.parser")?? 46.????except?Exception?as?e:?? 47.????????driver.close()?#?Close?the?current?window.?? 48.????????driver.quit()#關閉chrome瀏覽器?? 
49.????????time.sleep(0.5)?? 50.?? 51.????driver.close()?#?Close?the?current?window.?? 52.????driver.quit()#關閉chrome瀏覽器?? 53.?? 54.?? 55.????#查找想要的信息?? 56.????templist1=bsObj.find("td",{"class":"t_f"}).ul?? 57.????if?templist1==None:#判斷是否不包含ul字段,如果不,跳出函數?? 58.????????print("當前帖子頁面不包含ul字段")?? 59.????????return?1?? 60.????mycsv=['NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL']#初始化提取信息列表?? 61.????for?templist2?in?templist1.findAll("font",size=re.compile("^([0-9]+)*$")):?? 62.????????tempText=templist2.get_text()?? 63.????????#print(tempText[0:4])?? 64.????????if?"寶貝回家編號"?in?tempText[0:6]:?? 65.????????????print(tempText)?? 66.????????????index=tempText.find(":")?? 67.????????????tempText=tempText[index+1:]?? 68.????????????#mycsv.append(tempText)?? 69.????????????if?len(tempText)==0:?? 70.????????????????tempText="NULL"?? 71.????????????mycsv[0]=tempText?? 72.????????if?"尋親編號"?in?tempText[0:6]:?? 73.????????????print(tempText)?? 74.????????????index=tempText.find(":")?? 75.????????????tempText=tempText[index+1:]?? 76.????????????if?len(tempText)==0:?? 77.????????????????tempText="NULL"?? 78.????????????#mycsv.append(tempText)?? 79.????????????mycsv[0]=tempText?? 80.????????if?"登記編號"?in?tempText[0:6]:?? 81.????????????print(tempText)?? 82.????????????index=tempText.find(":")?? 83.????????????tempText=tempText[index+1:]?? 84.????????????if?len(tempText)==0:?? 85.????????????????tempText="NULL"?? 86.????????????#mycsv.append(tempText)?? 87.????????????mycsv[0]=tempText?? 88.????????if?"姓"?in?tempText[0:6]:?? 89.????????????print(tempText)?? 90.????????????index=tempText.find(":")?? 91.????????????tempText=tempText[index+1:]?? 92.????????????#mycsv.append(tempText)?? 93.????????????mycsv[1]=tempText?? 94.????????if"性"?in?tempText[0:6]:?? 95.????????????print(tempText)?? 96.????????????index=tempText.find(":")?? 97.????????????tempText=tempText[index+1:]?? 98.????????????#mycsv.append(tempText)?? 99.????????????mycsv[2]=tempText?? 
100.????????if?"出生日期"?in?tempText[0:6]:?? 101.????????????print(tempText)?? 102.????????????index=tempText.find(":")?? 103.????????????tempText=tempText[index+1:]?? 104.????????????#mycsv.append(tempText)?? 105.????????????mycsv[3]=tempText?? 106.????????if?"失蹤時身高"?in?tempText[0:6]:?? 107.????????????print(tempText)?? 108.????????????index=tempText.find(":")?? 109.????????????tempText=tempText[index+1:]?? 110.????????????#mycsv.append(tempText)?? 111.????????????mycsv[4]=tempText?? 112.????????if?"失蹤時間"?in?tempText[0:6]:?? 113.????????????print(tempText)?? 114.????????????index=tempText.find(":")?? 115.????????????tempText=tempText[index+1:]?? 116.????????????#mycsv.append(tempText)?? 117.????????????mycsv[5]=tempText?? 118.????????if?"失蹤日期"?in?tempText[0:6]:?? 119.????????????print(tempText)?? 120.????????????index=tempText.find(":")?? 121.????????????tempText=tempText[index+1:]?? 122.????????????#mycsv.append(tempText)?? 123.????????????mycsv[5]=tempText?? 124.????????if?"失蹤地點"?in?tempText[0:6]:?? 125.????????????print(tempText)?? 126.????????????index=tempText.find(":")?? 127.????????????tempText=tempText[index+1:]?? 128.????????????#mycsv.append(tempText)?? 129.????????????mycsv[6]=tempText?? 130.????????if?"是否報案"?in?tempText[0:6]:?? 131.????????????print(tempText)?? 132.????????????index=tempText.find(":")?? 133.????????????tempText=tempText[index+1:]?? 134.????????????#mycsv.append(tempText)?? 135.????????????mycsv[7]=tempText?? 136.????try:?? 137.????????writer.writerow((str(mycsv[0]),str(mycsv[1]),str(mycsv[2]),str(mycsv[3]),str(mycsv[4]),str(mycsv[5]),str(mycsv[6]),str(mycsv[7])))#寫入CSV文件?? 138.????????csvfile.flush()#馬上將這條數據寫入csv文件中?? 139.????finally:?? 140.????????print("當前帖子信息寫入完成\n")?? 141.????????time.sleep(5)#設置爬完之后的睡眠時間,這里先設置為1秒??現附上所有代碼,此代碼僅供參考,不能用于商業用途,網絡爬蟲易給網站服務器造成巨大負荷,任何人使用本代碼所引起的任何后果,本人不予承擔法律責任。貼出代碼的初衷是供大家學習爬蟲,大家只是研究下網絡框架即可,不要使用此代碼去加重網站負荷,本人由于不當使用,已被封IP,前車之鑒,爬取失蹤人口信息只是為了從空間上分析人口失蹤的規律,由此給網站造成的什么不便,請見諒。
附上所有代碼:
[python]?view plain?copy 1.#__author__?=?'Administrator'?? 2.#coding=utf-8?? 3.import?io?? 4.import?os?? 5.import?sys?? 6.import?math?? 7.import?urllib?? 8.from?urllib.request?import??urlopen?? 9.from?urllib.request?import?urlretrieve?? 10.from?urllib??import?request?? 11.from?bs4?import?BeautifulSoup?? 12.import?re?? 13.import?time?? 14.import?socket?? 15.import?csv?? 16.from?selenium?import?webdriver?? 17.?? 18.socket.setdefaulttimeout(5000)#設置全局超時函數?? 19.?? 20.?? 21.?? 22.sys.stdout?=?io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')?? 23.#sys.stdout?=?io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')?? 24.#設置不同的headers,偽裝為不同的瀏覽器?? 25.headers1={'User-Agent':'Mozilla/5.0?(Windows?NT?6.1;?WOW64;?rv:23.0)?Gecko/20100101?Firefox/23.0'}?? 26.headers2={'User-Agent':'Mozilla/5.0?(Windows?NT?6.3;?WOW64)?AppleWebKit/537.36?(KHTML,?like?Gecko)?Chrome/45.0.2454.101?Safari/537.36'}?? 27.headers3={'User-Agent':'Mozilla/5.0?(Windows?NT?6.1)?AppleWebKit/537.11?(KHTML,?like?Gecko)?Chrome/23.0.1271.64?Safari/537.11'}?? 28.headers4={'User-Agent':'Mozilla/5.0?(Windows?NT?10.0;?WOW64)?AppleWebKit/537.36?(KHTML,?like?Gecko)?Chrome/53.0.2785.104?Safari/537.36?Core/1.53.2372.400?QQBrowser/9.5.10548.400'}?? 29.headers5={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',?? 30.'Connection':'keep-alive',?? 31.'Host':'bbs.baobeihuijia.com',?? 32.'Referer':'http://bbs.baobeihuijia.com/forum-191-1.html',?? 33.'Upgrade-Insecure-Requests':'1',?? 34.'User-Agent':'Mozilla/5.0?(Windows?NT?6.1;?WOW64)?AppleWebKit/537.36?(KHTML,?like?Gecko)?Chrome/51.0.2704.103?Safari/537.36'}?? 35.?? 36.headers6={'Host':?'bbs.baobeihuijia.com',?? 37.'User-Agent':?'Mozilla/5.0?(Windows?NT?6.1;?WOW64;?rv:51.0)?Gecko/20100101?Firefox/51.0',?? 38.'Accept':?'textml,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',?? 39.'Connection':?'keep-alive',?? 40.'Upgrade-Insecure-Requests':'?1'?? 41.}?? 42.#得到當前頁面失蹤人口信息?? 43.#pageUrl為當前帖子頁面鏈接?? 
44.def?CurrentPageMissingPopulationInformation(tieziUrl):?? 45.????#設置代理IP訪問?? 46.????#代理IP可以上http://http.zhimaruanjian.com/獲取?? 47.????proxy_handler=urllib.request.ProxyHandler({'post':'128.199.169.17:80'})?? 48.????proxy_auth_handler=urllib.request.ProxyBasicAuthHandler()?? 49.????opener?=?urllib.request.build_opener(urllib.request.HTTPHandler,?proxy_handler)?? 50.????urllib.request.install_opener(opener)?? 51.?? 52.????try:?? 53.????????#掉用第三方包selenium打開瀏覽器登陸?? 54.????????#driver=webdriver.Chrome()#打開chrome?? 55.???????driver=webdriver.Chrome()#打開無界面瀏覽器Chrome?? 56.???????#driver=webdriver.PhantomJS()#打開無界面瀏覽器PhantomJS?? 57.???????driver.set_page_load_timeout(10)?? 58.???????#driver.implicitly_wait(30)?? 59.???????try:?? 60.???????????driver.get(tieziUrl)#登陸兩次?? 61.???????????driver.get(tieziUrl)?? 62.???????except?TimeoutError:?? 63.???????????driver.refresh()?? 64.?? 65.???????#print(driver.page_source)?? 66.???????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 67.????????#獲取網頁信息?? 68.????#抓捕網頁解析過程中的錯誤?? 69.???????try:?? 70.???????????#req=request.Request(tieziUrl,headers=headers5)?? 71.???????????#html=urlopen(req)?? 72.???????????bsObj=BeautifulSoup(html,"html.parser")?? 73.???????????#html.close()?? 74.???????except?UnicodeDecodeError?as?e:?? 75.???????????print("-----UnicodeDecodeError?url",tieziUrl)?? 76.???????except?urllib.error.URLError?as?e:?? 77.???????????print("-----urlError?url:",tieziUrl)?? 78.???????except?socket.timeout?as?e:?? 79.???????????print("-----socket?timout:",tieziUrl)?? 80.?? 81.?? 82.???????while(bsObj.find('title').get_text()?==?"頁面重載開啟"):?? 83.???????????print("當前頁面不是重加載后的頁面,程序會嘗試刷新一次到跳轉后的頁面\n")?? 84.???????????driver.get(tieziUrl)?? 85.???????????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 86.???????????bsObj=BeautifulSoup(html,"html.parser")?? 87.????except?Exception?as?e:?? 88.????????driver.close()?#?Close?the?current?window.?? 89.????????driver.quit()#關閉chrome瀏覽器?? 90.????????time.sleep(0.5)?? 91.?? 
92.????driver.close()?#?Close?the?current?window.?? 93.????driver.quit()#關閉chrome瀏覽器?? 94.?? 95.?? 96.????#查找想要的信息?? 97.????templist1=bsObj.find("td",{"class":"t_f"}).ul?? 98.????if?templist1==None:#判斷是否不包含ul字段,如果不,跳出函數?? 99.????????print("當前帖子頁面不包含ul字段")?? 100.????????return?1?? 101.????mycsv=['NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL']#初始化提取信息列表?? 102.????for?templist2?in?templist1.findAll("font",size=re.compile("^([0-9]+)*$")):?? 103.????????tempText=templist2.get_text()?? 104.????????#print(tempText[0:4])?? 105.????????if?"寶貝回家編號"?in?tempText[0:6]:?? 106.????????????print(tempText)?? 107.????????????index=tempText.find(":")?? 108.????????????tempText=tempText[index+1:]?? 109.????????????#mycsv.append(tempText)?? 110.????????????if?len(tempText)==0:?? 111.????????????????tempText="NULL"?? 112.????????????mycsv[0]=tempText?? 113.????????if?"尋親編號"?in?tempText[0:6]:?? 114.????????????print(tempText)?? 115.????????????index=tempText.find(":")?? 116.????????????tempText=tempText[index+1:]?? 117.????????????if?len(tempText)==0:?? 118.????????????????tempText="NULL"?? 119.????????????#mycsv.append(tempText)?? 120.????????????mycsv[0]=tempText?? 121.????????if?"登記編號"?in?tempText[0:6]:?? 122.????????????print(tempText)?? 123.????????????index=tempText.find(":")?? 124.????????????tempText=tempText[index+1:]?? 125.????????????if?len(tempText)==0:?? 126.????????????????tempText="NULL"?? 127.????????????#mycsv.append(tempText)?? 128.????????????mycsv[0]=tempText?? 129.????????if?"姓"?in?tempText[0:6]:?? 130.????????????print(tempText)?? 131.????????????index=tempText.find(":")?? 132.????????????tempText=tempText[index+1:]?? 133.????????????#mycsv.append(tempText)?? 134.????????????mycsv[1]=tempText?? 135.????????if"性"?in?tempText[0:6]:?? 136.????????????print(tempText)?? 137.????????????index=tempText.find(":")?? 138.????????????tempText=tempText[index+1:]?? 139.????????????#mycsv.append(tempText)?? 140.????????????mycsv[2]=tempText?? 
141.????????if?"出生日期"?in?tempText[0:6]:?? 142.????????????print(tempText)?? 143.????????????index=tempText.find(":")?? 144.????????????tempText=tempText[index+1:]?? 145.????????????#mycsv.append(tempText)?? 146.????????????mycsv[3]=tempText?? 147.????????if?"失蹤時身高"?in?tempText[0:6]:?? 148.????????????print(tempText)?? 149.????????????index=tempText.find(":")?? 150.????????????tempText=tempText[index+1:]?? 151.????????????#mycsv.append(tempText)?? 152.????????????mycsv[4]=tempText?? 153.????????if?"失蹤時間"?in?tempText[0:6]:?? 154.????????????print(tempText)?? 155.????????????index=tempText.find(":")?? 156.????????????tempText=tempText[index+1:]?? 157.????????????#mycsv.append(tempText)?? 158.????????????mycsv[5]=tempText?? 159.????????if?"失蹤日期"?in?tempText[0:6]:?? 160.????????????print(tempText)?? 161.????????????index=tempText.find(":")?? 162.????????????tempText=tempText[index+1:]?? 163.????????????#mycsv.append(tempText)?? 164.????????????mycsv[5]=tempText?? 165.????????if?"失蹤地點"?in?tempText[0:6]:?? 166.????????????print(tempText)?? 167.????????????index=tempText.find(":")?? 168.????????????tempText=tempText[index+1:]?? 169.????????????#mycsv.append(tempText)?? 170.????????????mycsv[6]=tempText?? 171.????????if?"是否報案"?in?tempText[0:6]:?? 172.????????????print(tempText)?? 173.????????????index=tempText.find(":")?? 174.????????????tempText=tempText[index+1:]?? 175.????????????#mycsv.append(tempText)?? 176.????????????mycsv[7]=tempText?? 177.????try:?? 178.????????writer.writerow((str(mycsv[0]),str(mycsv[1]),str(mycsv[2]),str(mycsv[3]),str(mycsv[4]),str(mycsv[5]),str(mycsv[6]),str(mycsv[7])))#寫入CSV文件?? 179.????????csvfile.flush()#馬上將這條數據寫入csv文件中?? 180.????finally:?? 181.????????print("當前帖子信息寫入完成\n")?? 182.????????time.sleep(5)#設置爬完之后的睡眠時間,這里先設置為1秒?? 183.?? 184.?? 185.#得到當前板塊所有的頁面鏈接?? 186.#siteUrl為當前版塊的頁面鏈接?? 187.def?GetALLPageUrl(siteUrl):?? 188.????#設置代理IP訪問?? 189.????#代理IP可以上http://http.zhimaruanjian.com/獲取?? 
190.????proxy_handler=urllib.request.ProxyHandler({'post':'123.207.143.51:8080'})?? 191.????proxy_auth_handler=urllib.request.ProxyBasicAuthHandler()?? 192.????opener?=?urllib.request.build_opener(urllib.request.HTTPHandler,?proxy_handler)?? 193.????urllib.request.install_opener(opener)?? 194.?? 195.????try:?? 196.????????#掉用第三方包selenium打開瀏覽器登陸?? 197.????????#driver=webdriver.Chrome()#打開chrome?? 198.???????driver=webdriver.Chrome()#打開無界面瀏覽器Chrome?? 199.???????#driver=webdriver.PhantomJS()#打開無界面瀏覽器PhantomJS?? 200.???????driver.set_page_load_timeout(10)?? 201.???????#driver.implicitly_wait(30)?? 202.???????try:?? 203.???????????driver.get(siteUrl)#登陸兩次?? 204.???????????driver.get(siteUrl)?? 205.???????except?TimeoutError:?? 206.???????????driver.refresh()?? 207.?? 208.???????#print(driver.page_source)?? 209.???????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 210.????????#獲取網頁信息?? 211.????#抓捕網頁解析過程中的錯誤?? 212.???????try:?? 213.???????????#req=request.Request(tieziUrl,headers=headers5)?? 214.???????????#html=urlopen(req)?? 215.???????????bsObj=BeautifulSoup(html,"html.parser")?? 216.???????????#print(bsObj.find('title').get_text())?? 217.???????????#html.close()?? 218.???????except?UnicodeDecodeError?as?e:?? 219.???????????print("-----UnicodeDecodeError?url",siteUrl)?? 220.???????except?urllib.error.URLError?as?e:?? 221.???????????print("-----urlError?url:",siteUrl)?? 222.???????except?socket.timeout?as?e:?? 223.???????????print("-----socket?timout:",siteUrl)?? 224.?? 225.?? 226.?? 227.???????while(bsObj.find('title').get_text()?==?"頁面重載開啟"):?? 228.???????????print("當前頁面不是重加載后的頁面,程序會嘗試刷新一次到跳轉后的頁面\n")?? 229.???????????driver.get(siteUrl)?? 230.???????????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 231.???????????bsObj=BeautifulSoup(html,"html.parser")?? 232.????except?Exception?as?e:?? 233.?? 234.????????driver.close()?#?Close?the?current?window.?? 235.????????driver.quit()#關閉chrome瀏覽器?? 236.????????#time.sleep()?? 237.?? 
238.????driver.close()?#?Close?the?current?window.?? 239.????driver.quit()#關閉chrome瀏覽器?? 240.?? 241.?? 242.????#http://bbs.baobeihuijia.com/forum-191-1.html變成http://bbs.baobeihuijia.com,以便組成頁面鏈接?? 243.????siteindex=siteUrl.rfind("/")?? 244.????tempsiteurl=siteUrl[0:siteindex+1]#http://bbs.baobeihuijia.com/?? 245.????tempbianhaoqian=siteUrl[siteindex+1:-6]#forum-191-?? 246.?? 247.????#爬取想要的信息?? 248.????bianhao=[]#存儲頁面編號?? 249.????pageUrl=[]#存儲頁面鏈接?? 250.?? 251.????templist1=bsObj.find("div",{"class":"pg"})?? 252.????#if?templist1==None:?? 253.????????#return?? 254.????for?templist2?in?templist1.findAll("a",href=re.compile("forum-([0-9]+)-([0-9]+).html")):?? 255.????????if?templist2==None:?? 256.????????????continue?? 257.????????lianjie=templist2.attrs['href']?? 258.????????#print(lianjie)?? 259.????????index1=lianjie.rfind("-")#查找-在字符串中的位置?? 260.????????index2=lianjie.rfind(".")#查找.在字符串中的位置?? 261.????????tempbianhao=lianjie[index1+1:index2]?? 262.????????bianhao.append(int(tempbianhao))?? 263.????bianhaoMax=max(bianhao)#獲取頁面的最大編號?? 264.?? 265.????for?i?in?range(1,bianhaoMax+1):?? 266.????????temppageUrl=tempsiteurl+tempbianhaoqian+str(i)+".html"#組成頁面鏈接?? 267.????????print(temppageUrl)?? 268.????????pageUrl.append(temppageUrl)?? 269.????return?pageUrl#返回頁面鏈接列表?? 270.?? 271.#得到當前版塊頁面所有帖子的鏈接?? 272.def?GetCurrentPageTieziUrl(PageUrl):?? 273.????#設置代理IP訪問?? 274.????#代理IP可以上http://http.zhimaruanjian.com/獲取?? 275.????proxy_handler=urllib.request.ProxyHandler({'post':'110.73.30.157:8123'})?? 276.????proxy_auth_handler=urllib.request.ProxyBasicAuthHandler()?? 277.????opener?=?urllib.request.build_opener(urllib.request.HTTPHandler,?proxy_handler)?? 278.????urllib.request.install_opener(opener)?? 279.?? 280.????try:?? 281.????????#掉用第三方包selenium打開瀏覽器登陸?? 282.????????#driver=webdriver.Chrome()#打開chrome?? 283.???????driver=webdriver.Chrome()#打開無界面瀏覽器Chrome?? 284.???????#driver=webdriver.PhantomJS()#打開無界面瀏覽器PhantomJS?? 285.???????driver.set_page_load_timeout(10)?? 
286.???????try:?? 287.???????????driver.get(PageUrl)#登陸兩次?? 288.???????????driver.get(PageUrl)?? 289.???????except?TimeoutError:?? 290.???????????driver.refresh()?? 291.?? 292.???????#print(driver.page_source)?? 293.???????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 294.????????#獲取網頁信息?? 295.????#抓捕網頁解析過程中的錯誤?? 296.???????try:?? 297.???????????#req=request.Request(tieziUrl,headers=headers5)?? 298.???????????#html=urlopen(req)?? 299.???????????bsObj=BeautifulSoup(html,"html.parser")?? 300.???????????#html.close()?? 301.???????except?UnicodeDecodeError?as?e:?? 302.???????????print("-----UnicodeDecodeError?url",PageUrl)?? 303.???????except?urllib.error.URLError?as?e:?? 304.???????????print("-----urlError?url:",PageUrl)?? 305.???????except?socket.timeout?as?e:?? 306.???????????print("-----socket?timout:",PageUrl)?? 307.?? 308.???????n=0?? 309.???????while(bsObj.find('title').get_text()?==?"頁面重載開啟"):?? 310.???????????print("當前頁面不是重加載后的頁面,程序會嘗試刷新一次到跳轉后的頁面\n")?? 311.???????????driver.get(PageUrl)?? 312.???????????html=driver.page_source#將瀏覽器執行后的源代碼賦給html?? 313.???????????bsObj=BeautifulSoup(html,"html.parser")?? 314.???????????n=n+1?? 315.???????????if?n==10:?? 316.???????????????driver.close()?#?Close?the?current?window.?? 317.???????????????driver.quit()#關閉chrome瀏覽器?? 318.???????????????return?1?? 319.?? 320.????except?Exception?as?e:?? 321.????????driver.close()?#?Close?the?current?window.?? 322.????????driver.quit()#關閉chrome瀏覽器?? 323.????????time.sleep(1)?? 324.?? 325.????driver.close()?#?Close?the?current?window.?? 326.????driver.quit()#關閉chrome瀏覽器?? 327.?? 328.?? 329.????#http://bbs.baobeihuijia.com/forum-191-1.html變成http://bbs.baobeihuijia.com,以便組成帖子鏈接?? 330.????siteindex=PageUrl.rfind("/")?? 331.????tempsiteurl=PageUrl[0:siteindex+1]#http://bbs.baobeihuijia.com/?? 332.????#print(tempsiteurl)?? 333.????TieziUrl=[]?? 334.????#爬取想要的信息?? 335.????for?templist1?in?bsObj.findAll("tbody",id=re.compile("normalthread_([0-9]+)"))?:?? 336.????????if?templist1==None:?? 
337.????????????continue?? 338.????????for?templist2?in?templist1.findAll("a",{"class":"s?xst"}):?? 339.????????????if?templist2==None:?? 340.????????????????continue?? 341.????????????tempteiziUrl=tempsiteurl+templist2.attrs['href']#組成帖子鏈接?? 342.????????????print(tempteiziUrl)?? 343.????????????TieziUrl.append(tempteiziUrl)?? 344.????return?TieziUrl#返回帖子鏈接列表?? 345.?? 346.?? 347.?? 348.#CurrentPageMissingPopulationInformation("http://bbs.baobeihuijia.com/thread-213126-1-1.html")?? 349.#GetALLPageUrl("http://bbs.baobeihuijia.com/forum-191-1.html")?? 350.#GetCurrentPageTieziUrl("http://bbs.baobeihuijia.com/forum-191-1.html")?? 351.?? 352.if?__name__?==?'__main__':?? 353.????csvfile=open("E:/MissingPeople.csv","w+",newline="",encoding='gb18030')?? 354.????writer=csv.writer(csvfile)?? 355.????writer.writerow(('寶貝回家編號','姓名','性別','出生日期','失蹤時身高','失蹤時間','失蹤地點','是否報案'))?? 356.????pageurl=GetALLPageUrl("https://bbs.baobeihuijia.com/forum-191-1.html")#尋找失蹤寶貝?? 357.????#pageurl=GetALLPageUrl("http://bbs.baobeihuijia.com/forum-189-1.html")#被拐寶貝回家?? 358.????time.sleep(5)?? 359.????print("所有頁面鏈接獲取成功!\n")?? 360.????n=0?? 361.????for?templist1?in?pageurl:?? 362.????????#print(templist1)?? 363.????????tieziurl=GetCurrentPageTieziUrl(templist1)?? 364.????????time.sleep(5)?? 365.????????print("當前頁面"+str(templist1)+"所有帖子鏈接獲取成功!\n")?? 366.????????if?tieziurl?==1:?? 367.????????????print("不能得到當前帖子頁面!\n")?? 368.????????????continue?? 369.????????else:?? 370.????????????for?templist2?in?tieziurl:?? 371.????????????#print(templist2)?? 372.???????????????n=n+1?? 373.???????????????print("\n正在收集第"+str(n)+"條信息!")?? 374.???????????????time.sleep(5)?? 375.???????????????tempzhi=CurrentPageMissingPopulationInformation(templist2)?? 376.???????????????if?tempzhi==1:?? 377.??????????????????print("\n第"+str(n)+"條信息為空!")?? 378.??????????????????continue?? 379.????print('')?? 380.????print("信息爬取完成!請放心的關閉程序!")?? 381.????csvfile.close()??寫成的CSV文件截圖:
總結
以上是生活随笔為你收集整理的Python爬虫小实践:寻找失踪人口,爬取失踪儿童信息并写成csv文件,方便存入数据库...的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: SpringMVC的Controller
- 下一篇: ASPxGridView EditFor