在Ubuntu 14.04 64bit上使用pycURL模块示例
PycURL 傳說是實現Python下多線程網頁抓取的效率最高的解決方案,本質是對libcurl C語言庫的封裝。
在Linux上有個常用的命令 curl(非常好用),支持curl的就是大名鼎鼎的libcurl庫;libcurl是功能強大的,而且是非常高效的函數庫。libcurl除了提供本身的C API之外,還有多達40種編程語言的Binding,這里介紹的PycURL就是libcurl的Python binding。
在Python中對網頁進行GET/POST等請求,當需要考慮高性能的時候,libcurl是非常不錯的選擇,一般來說會比urllib、urllib2快不少,可能也會比Requests的效率更高。特別是使用PycURL的多并發請求時,更是效率很高的。個人感覺,其唯一的缺點是,由于是直接調用的是libcurl C庫,PycURL的函數接口之類的還和C中的東西很像,可能不是那么的Pythonic,寫代碼的學習曲線稍微比urllib高一點兒。
https://github.com/pycurl/pycurl    //pycurl模塊的源碼。搭建好sphinx環境之后,從git源碼目錄下運行 make docs 生成文檔,就會在build/doc中看到相關文檔信息,進去之后直接點擊index.html進行查看
下面是我的幾個實踐示例
1.最簡單的網頁獲取
#!/usr/bin/env python
#-*- coding: utf-8 -*-import sys, pycurl, time, cStringIOsys.stderr.write("pycURL version [%s]\n" % pycurl.version)start_time = time.time()url = 'http://www.dianping.com/shanghai'
b = cStringIO.StringIO()
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, b.write)
c.perform()
end_time = time.time()content = b.getvalue()duration = end_time - start_time
print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
c.close()print 'pycurl takes [%s] seconds to get [%s]' % (duration, url)
print 'length of the content is [%d]' % len(content)
2.簡單的pycURL包裝類
#!/usr/bin/env python
#encoding: utf-8import sys, pycurl, cStringIO, urllibclass Curl:def __init__(self):self.c = pycurl.Curl()def __del__(self):self.c.close()def init(self, verbose):c = self.c;c.setopt(c.FOLLOWLOCATION, 1)c.setopt(c.MAXREDIRS, 5)c.setopt(c.CONNECTTIMEOUT, 30)c.setopt(c.TIMEOUT, 300)c.setopt(c.NOSIGNAL, 1)c.setopt(c.USERAGENT, "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36")c.setopt(c.VERBOSE, verbose)def get(self, url):b = cStringIO.StringIO()c = self.c;c.setopt(c.URL, url)c.setopt(c.WRITEFUNCTION, b.write)c.perform()content = b.getvalue()print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)b.close()return contentdef post(self, url, data):b = cStringIO.StringIO()c = self.c;c.setopt(c.POSTFIELDS, urllib.urlencode(data))c.setopt(c.URL, url)c.setopt(c.WRITEFUNCTION, b.write)c.perform()content = b.getvalue()print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)b.close()return contentdef purge(self, url):cmd = 'PURGE 'proxy = '127.0.0.1:8080'c = self.cc.setopt(c.URL, url)c.setopt(c.PROXY, proxy)c.setopt(c.CUSTOMREQUEST, cmd)c.perform()status = c.getinfo(c.HTTP_CODE)print "HTTP CODE: ", statusreturn statusif __name__ == '__main__':page = 'http://news.sohu.com/'c = Curl()c.init(True)c.get(page)page1 = 'http://www.google.com/'post_data_dic = {"name":"value"}c.post(page1, post_data_dic)page2 = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'c.purge(page2)
3.簡單的pycURL multi類包裝
#!/usr/bin/env python
#encoding: utf-8import sys, pycurl, cStringIOclass MCurl:def __init__(self, tasks, concurrent):self.taskQ = tasksself.taskQ_size = len(tasks)self.max_conn = concurrentself.resp_dict = {}self.m = pycurl.CurlMulti()def __del__(self):self.m.close()def add_tasks(self):self.max_conn = min(self.taskQ_size, self.max_conn)assert 1 <= self.max_conn <= 100, "invalid number of concurrent urls"print "===Getting %d urls using %d concurrent cURL handle pool===" % (self.taskQ_size, self.max_conn)self.m.handles = []for i in range(self.max_conn):c = pycurl.Curl()c.fp = Nonec.setopt(pycurl.FOLLOWLOCATION, 1)c.setopt(pycurl.MAXREDIRS, 5)c.setopt(pycurl.CONNECTTIMEOUT, 30)c.setopt(pycurl.TIMEOUT, 300)c.setopt(pycurl.NOSIGNAL, 1)self.m.handles.append(c)self.resp_dict['total'] = self.taskQ_sizeself.resp_dict['succ'] = []self.resp_dict['fail'] = []def process_tasks(self):freelist = self.m.handles[:]queue = self.taskQnum_processed = 0while num_processed < self.taskQ_size:#if there is an url to process and a free curl handle, add to multi stackwhile queue and freelist:url, filename = queue.pop(0)c = freelist.pop()c.fp = open(filename, "wb")c.setopt(pycurl.URL, url)c.setopt(pycurl.WRITEDATA, c.fp)self.m.add_handle(c)#store some info for use laterc.filename = filenamec.url = url#run the internal curl state machine for the multi stackwhile 1:ret, num_handles = self.m.perform()if ret != pycurl.E_CALL_MULTI_PERFORM:break#check if curl handle has terminated, and add them to the freelistwhile 1:num_q, ok_list, err_list = self.m.info_read()for c in ok_list:c.fp.close()c.fp = Noneself.resp_dict['succ'].append(c.url)self.m.remove_handle(c)print ("Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL))freelist.append(c)for c, errno, errmsg in err_list:c.fp.close()c.fp = Noneself.resp_dict['fail'].append(c.url)self.m.remove_handle(c)print("Failed: ", c.filename, c.url, errno, errmsg)freelist.append(c)num_processed = num_processed + len(ok_list) + len(err_list)if num_q == 0:break;#currently no 
more I/O is pending, we just call select() to sleep until some more data is availableself.m.select(1.0)def del_tasks(self):for c in self.m.handles:if c.fp is not None:c.fp.close()c.fp = Nonec.close()def dump_process(self):print self.resp_dict#========= main entry point ==========
#give tasks info
urls = ["http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg",
"http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg", "", "http://m2.biz.itc.cn/pic/new/n/93/87/Img7798793_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/92/87/Img7798792_n.jpg", "http://m3.biz.itc.cn/pic/new/n/94/91/Img7799194_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/96/87/Img7798796_n.jpg", "http://m2.biz.itc.cn/pic/new/n/97/87/Img7798797_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/16/88/Img7798816_n.jpg", "http://m2.biz.itc.cn/pic/new/n/17/88/Img7798817_n.jpg",
"http://m4.biz.itc.cn/pic/new/n/95/87/Img7798795_n.jpg", "http://m4.biz.itc.cn/pic/new/n/91/91/Img7799191_n.jpg"]concurr = 6
queue = []
for url in urls:url = url.strip()if not url or url[0] == "#":continuefilename = "./sohu_%03d.jpg" % (len(queue) + 1)queue.append((url, filename))mc = MCurl(queue, concurr)
mc.add_tasks()
mc.process_tasks()
mc.del_tasks()
mc.dump_process()
運行截圖
4.PURGE等自定義請求實現
#!/usr/bin/env python
#encoding: utf-8import sys, pycurl, cStringIO, urlliburl = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'
cmd = 'PURGE '
#cmd = 'DELETE '
proxy = '127.0.0.1:8080'
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.VERBOSE, 1)
c.setopt(c.PROXY, proxy)
c.setopt(c.CUSTOMREQUEST, cmd)
try:c.perform()
except Exception as e:print e
status = c.getinfo(c.HTTP_CODE)
print "HTTP CODE: ", status
c.close()
運行截圖
說明:
1.使用post表單時,只需要設置
c.setopt(c.POSTFIELDS, postfields) 這個選項會自動將HTTP request method改為POST。源碼pycurl/examples/quickstart/form_post.py 很標準
2.異步批量預取的例子在pycurl/examples/retriever-multi.py,很有代表性的
3.使用自定義方法
c.setopt(pycurl.CUSTOMREQUEST,"DELETE") 詳見官網文章 http://curl.haxx.se/libcurl/c/CURLOPT_CUSTOMREQUEST.html 以及 http://stackoverflow.com/questions/17075844/how-to-send-delete-request-in-pycurl 中對 delete request 的總結
以上是生活随笔為你收集整理的在Ubuntu 14.04 64bit上使用pycURL模块示例的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 在Ubuntu 14.04 64bit上
- 下一篇: 在Ubuntu 14.04 64bit上