因為工作的關係,我寫過許多個抓取網站信息的程序。 最簡單的,只要用Python的urllib2.urlopen()函數就可以了; 然後,有個網站喜歡封人,所以,得找一批代理,輪流抓它的信息; 有的網站不允許程序抓取,所以,就得加入一些頭信息; 有的網站需要登錄,這時就要用到Cookies; 最後,為了提高效率,最好是使用多線程。(PS,有個地方要注意,urlopen這個函數,設定了一個全局對象opener,所以如果你使用了多個線程, 每個線程使用一個代理,那麼,不能使用urlopen這個函數,而應該使用opener.open) 下面是我用Python寫的一個抓代理的腳本,雖然現在已經不在教育網內部了,不過有時候還是需要用一下代理的:) # -*- coding: cp936 -*- import urllib2,re,thread,time
# Imports are repeated here so that this listing is self-contained.
import re
import socket
import thread
import time
import urllib2

# Abort any socket operation that stalls for more than 10 seconds.
socket.setdefaulttimeout(10)

# Number of worker threads used to verify proxies in stage 2.
CHECK_THREADS = 120


# ------------------------ proxy scraping functions ------------------------ #

def getcnproxy(name):
    """Scrape proxy addresses from the www.proxycn.com listing pages.

    Note that, despite the function name, the site queried is proxycn.com.
    Pages are fetched in increasing order until a page containing the
    end-of-listing marker has been read, or until more than six fetches
    have failed.  A failed page is retried, so entries seen before the
    failure may be collected twice; stage 2 de-duplicates them.

    Args:
        name: Label printed in front of every proxy that is found.

    Returns:
        The list of proxy strings found.  The same list is also stored in
        the module-level ``proxylist[0]`` to signal that this scraper is done.
    """
    pagenum = 0
    result = []
    getallpages = 0
    trycount = 0
    while getallpages == 0 and trycount <= 6:
        pagenum += 1
        url = 'http://www.proxycn.com/html_proxy/http-' + str(pagenum) + '.html'
        try:
            html = urllib2.urlopen(url)
            try:
                for line in html:
                    if '''onDblClick="clip''' in line:
                        # The address is the argument of the clip('...') call.
                        proxy = line[line.find("clip('") + 6:line.find("')")]
                        with lock:  # keep output of the two scrapers unmixed
                            print('%s %s' % (name, proxy))
                        result.append(proxy)
                    # NOTE(review): this marker must match the page's raw
                    # bytes under the file's declared source encoding -
                    # verify it survived any re-encoding of this listing.
                    if '下一頁|尾頁' in line:
                        getallpages = 1
            finally:
                html.close()
        except Exception:
            # Count the failure and step back so the same page is retried.
            trycount += 1
            pagenum -= 1
    proxylist[0] = result
    return result


def getproxycn(name):
    """Scrape proxy addresses from the www.cnproxy.com listing pages.

    Note that, despite the function name, the site queried is cnproxy.com.
    Pages proxy1.html .. proxy10.html are fetched in order; the loop stops
    early once more than two fetches have failed.  A failed page is retried.

    Args:
        name: Label printed in front of every proxy that is found.

    Returns:
        The list of proxy strings found.  The same list is also stored in
        the module-level ``proxylist[1]`` to signal that this scraper is done.
    """
    pagenum = 0
    result = []
    trycount = 0
    while pagenum <= 9 and trycount <= 2:
        pagenum += 1
        url = 'http://www.cnproxy.com/proxy' + str(pagenum) + '.html'
        try:
            html = urllib2.urlopen(url)
            try:
                for line in html:
                    if "HTTP" in line:
                        # Host part (after '<td>') followed by the ':port'
                        # part (up to the next cell).
                        # NOTE(review): the '̴' delimiter looks like a
                        # mis-encoded character - confirm against the
                        # original source before relying on it.
                        proxy = (line[line.find('<td>') + 4:line.find('̴')]
                                 + line[line.find(':'):line.find('</td><td>')])
                        with lock:  # keep output of the two scrapers unmixed
                            print('%s %s' % (name, proxy))
                        result.append(proxy)
            finally:
                html.close()
        except Exception:
            # Count the failure and step back so the same page is retried.
            trycount += 1
            pagenum -= 1
    proxylist[1] = result
    return result


# --------------------- end of proxy scraping functions --------------------- #

# ----------------------- proxy verification functions ---------------------- #

def proxycheckone(proxy):
    """Check whether a single proxy can fetch http://www.facebook.com.

    A dedicated opener is built for the proxy (rather than calling
    ``urllib2.urlopen``) and the request is attempted at most twice, with a
    3 second pause after each failure.

    Args:
        proxy: Proxy address as a ``host:port`` string.

    Returns:
        ``proxy + '$' + <attempt number> + '#' + <elapsed seconds>`` when the
        fetched page contains ``'Welcome to Facebook!'``; an empty list when
        the page does not contain it or when both attempts fail.
    """
    url = 'http://www.facebook.com'
    proxy_url = 'http://' + proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    request = urllib2.Request(url)
    # Extra headers so the request is not rejected with a 403 error.
    request.add_header("Accept-Language", "zh-cn")
    request.add_header("Content-Type", "text/html; charset=gb2312")
    request.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    trycount = 1
    while trycount <= 2:
        try:
            started = time.time()
            response = opener.open(request)
            try:
                data = response.read()
            finally:
                response.close()
        except Exception:
            time.sleep(3)
            trycount += 1
            continue
        if 'Welcome to Facebook!' not in data:
            return []
        elapsed = time.time() - started
        return proxy + '$' + str(trycount) + '#' + str(elapsed)
    return []


def proxycheck(idnum):
    """Worker loop: verify proxies taken from the shared ``proxylist``.

    Proxies are removed from the front of the module-level ``proxylist``
    under lock ``r``; every non-empty result of ``proxycheckone`` is appended
    to the module-level ``y`` under lock ``a``.  When the worker stops, for
    whatever reason, ``x[idnum]`` is set to 1 so that the main loop waiting
    on ``x`` can terminate.

    Args:
        idnum: Index of this worker's completion flag in ``x``.
    """
    try:
        while True:
            with r:
                if not proxylist:
                    break
                candidate = proxylist.pop(0)
            outcome = proxycheckone(candidate)
            if outcome:
                with a:
                    y.append(outcome)
    finally:
        x[idnum] = 1


# ------------------- end of proxy verification functions ------------------- #

# ---- Stage 1: scrape proxies and append them to proxies.txt, one per line ---- #
# The "#x='''" / "#'''" markers bracket this stage; presumably removing the
# leading '#' from both turns the stage into a string literal and skips it.
#x='''
lock = thread.allocate_lock()
# One slot per scraper; None means "not finished yet".  An empty list cannot
# be used as the marker because a scraper may legitimately find nothing.
proxylist = [None, None]
thread.start_new_thread(getcnproxy, ('cnproxy',))
thread.start_new_thread(getproxycn, ('proxycn',))
while None in proxylist:
    time.sleep(30)
proxylist = proxylist[0] + proxylist[1]
with open('proxies.txt', 'a') as w:
    w.write('\n'.join(proxylist))
print('get all proxies!\n\n')
#'''
# ------------------------------ end of stage 1 ------------------------------ #

# ------------- Stage 2: verify every proxy listed in proxies.txt ------------- #
with open('proxies.txt') as w:
    raw = w.read()
# Drop anything after a tab on each line and split into unique entries.
# Lines written by an earlier verification run have the form
# "proxy$tries#seconds"; keep only the address part so they are re-checked
# instead of being used verbatim as a (broken) proxy address.
proxylist = list(set(
    entry.split('$', 1)[0]
    for entry in re.sub(r'(\t+[^\n]*\n|\n)', ',', raw).split(',')
))
proxylist = [entry for entry in proxylist if entry != '']

# Presumably re-created so that `lock` exists even when stage 1 is disabled.
lock = thread.allocate_lock()
r = thread.allocate_lock()   # guards proxylist (the work queue)
a = thread.allocate_lock()   # guards y (the verified results)
y = []
x = [0] * CHECK_THREADS      # per-worker completion flags
for idnum in range(CHECK_THREADS):
    thread.start_new_thread(proxycheck, (idnum,))
while 0 in x:
    # Progress: proxies still queued, workers finished, proxies verified.
    print('%d %d left %d' % (len(proxylist), sum(x), len(y)))
    time.sleep(10)
with open('proxies.txt', 'w') as w:
    w.write(re.sub('^\n', '', re.sub(r'\n+', '\n', '\n'.join(y) + '\n')))
# ------------------------------ end of stage 2 ------------------------------ #