在Ubuntu 14.04 64bit上使用pycURL模块示例
PycURL 傳說是實現Python下多線程網頁抓取的效率最高的解決方案,本質是對libcurl C語言庫的封裝。
在Linux上有個常用的命令 curl(非常好用),支持curl的就是大名鼎鼎的libcurl庫;libcurl是功能強大的,而且是非常高效的函數庫。libcurl除了提供本身的C API之外,還有多達40種編程語言的Binding,這里介紹的PycURL就是libcurl的Python binding。
在Python中對網頁進行GET/POST等請求,當需要考慮高性能的時候,libcurl是非常不錯的選擇,一般來說會比urllib、urllib2快不少,可能也會比Requests的效率更高。特別是使用PycURL的多并發請求時,更是效率很高的。個人感覺,其唯一的缺點是,由于是直接調用的是libcurl C庫,PycURL的函數接口之類的還和C中的東西很像,可能不是那么的Pythonic,寫代碼的學習曲線稍微比urllib高一點兒。
https://github.com/pycurl/pycurl    //pycurl模塊的源碼。搭建好sphinx環境之后,從git源碼目錄下運行 make docs 生成文檔,就會在build/doc中看到相關文檔信息,進去之后直接點擊index.html進行查看
下面是我的幾個實踐示例
1.最簡單的網頁獲取
#!/usr/bin/env python
#-*- coding: utf-8 -*-import sys, pycurl, time, cStringIOsys.stderr.write("pycURL version [%s]\n" % pycurl.version)start_time = time.time()url = 'http://www.dianping.com/shanghai'
b = cStringIO.StringIO()
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, b.write)
c.perform()
end_time = time.time()content = b.getvalue()duration = end_time - start_time
print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
c.close()print 'pycurl takes [%s] seconds to get [%s]' % (duration, url)
print 'length of the content is [%d]' % len(content)
2.簡單的pycURL包裝類
#!/usr/bin/env python
#encoding: utf-8import sys, pycurl, cStringIO, urllibclass Curl:def __init__(self):self.c = pycurl.Curl()def __del__(self):self.c.close()def init(self, verbose):c = self.c;c.setopt(c.FOLLOWLOCATION, 1)c.setopt(c.MAXREDIRS, 5)c.setopt(c.CONNECTTIMEOUT, 30)c.setopt(c.TIMEOUT, 300)c.setopt(c.NOSIGNAL, 1)c.setopt(c.USERAGENT, "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36")c.setopt(c.VERBOSE, verbose)def get(self, url):b = cStringIO.StringIO()c = self.c;c.setopt(c.URL, url)c.setopt(c.WRITEFUNCTION, b.write)c.perform()content = b.getvalue()print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)b.close()return contentdef post(self, url, data):b = cStringIO.StringIO()c = self.c;c.setopt(c.POSTFIELDS, urllib.urlencode(data))c.setopt(c.URL, url)c.setopt(c.WRITEFUNCTION, b.write)c.perform()content = b.getvalue()print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)b.close()return contentdef purge(self, url):cmd = 'PURGE 'proxy = '127.0.0.1:8080'c = self.cc.setopt(c.URL, url)c.setopt(c.PROXY, proxy)c.setopt(c.CUSTOMREQUEST, cmd)c.perform()status = c.getinfo(c.HTTP_CODE)print "HTTP CODE: ", statusreturn statusif __name__ == '__main__':page = 'http://news.sohu.com/'c = Curl()c.init(True)c.get(page)page1 = 'http://www.google.com/'post_data_dic = {"name":"value"}c.post(page1, post_data_dic)page2 = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'c.purge(page2)
3.簡單的pycURL multi類包裝
#!/usr/bin/env python
#encoding: utf-8import sys, pycurl, cStringIOclass MCurl:def __init__(self, tasks, concurrent):self.taskQ = tasksself.taskQ_size = len(tasks)self.max_conn = concurrentself.resp_dict = {}self.m = pycurl.CurlMulti()def __del__(self):self.m.close()def add_tasks(self):self.max_conn = min(self.taskQ_size, self.max_conn)assert 1 <= self.max_conn <= 100, "invalid number of concurrent urls"print "===Getting %d urls using %d concurrent cURL handle pool===" % (self.taskQ_size, self.max_conn)self.m.handles = []for i in range(self.max_conn):c = pycurl.Curl()c.fp = Nonec.setopt(pycurl.FOLLOWLOCATION, 1)c.setopt(pycurl.MAXREDIRS, 5)c.setopt(pycurl.CONNECTTIMEOUT, 30)c.setopt(pycurl.TIMEOUT, 300)c.setopt(pycurl.NOSIGNAL, 1)self.m.handles.append(c)self.resp_dict['total'] = self.taskQ_sizeself.resp_dict['succ'] = []self.resp_dict['fail'] = []def process_tasks(self):freelist = self.m.handles[:]queue = self.taskQnum_processed = 0while num_processed < self.taskQ_size:#if there is an url to process and a free curl handle, add to multi stackwhile queue and freelist:url, filename = queue.pop(0)c = freelist.pop()c.fp = open(filename, "wb")c.setopt(pycurl.URL, url)c.setopt(pycurl.WRITEDATA, c.fp)self.m.add_handle(c)#store some info for use laterc.filename = filenamec.url = url#run the internal curl state machine for the multi stackwhile 1:ret, num_handles = self.m.perform()if ret != pycurl.E_CALL_MULTI_PERFORM:break#check if curl handle has terminated, and add them to the freelistwhile 1:num_q, ok_list, err_list = self.m.info_read()for c in ok_list:c.fp.close()c.fp = Noneself.resp_dict['succ'].append(c.url)self.m.remove_handle(c)print ("Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL))freelist.append(c)for c, errno, errmsg in err_list:c.fp.close()c.fp = Noneself.resp_dict['fail'].append(c.url)self.m.remove_handle(c)print("Failed: ", c.filename, c.url, errno, errmsg)freelist.append(c)num_processed = num_processed + len(ok_list) + len(err_list)if num_q == 0:break;#currently no 
more I/O is pending, we just call select() to sleep until some more data is availableself.m.select(1.0)def del_tasks(self):for c in self.m.handles:if c.fp is not None:c.fp.close()c.fp = Nonec.close()def dump_process(self):print self.resp_dict#========= main entry point ==========
#give tasks info
urls = ["http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg",
"http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg", "", "http://m2.biz.itc.cn/pic/new/n/93/87/Img7798793_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/92/87/Img7798792_n.jpg", "http://m3.biz.itc.cn/pic/new/n/94/91/Img7799194_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/96/87/Img7798796_n.jpg", "http://m2.biz.itc.cn/pic/new/n/97/87/Img7798797_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/16/88/Img7798816_n.jpg", "http://m2.biz.itc.cn/pic/new/n/17/88/Img7798817_n.jpg",
"http://m4.biz.itc.cn/pic/new/n/95/87/Img7798795_n.jpg", "http://m4.biz.itc.cn/pic/new/n/91/91/Img7799191_n.jpg"]concurr = 6
queue = []
for url in urls:url = url.strip()if not url or url[0] == "#":continuefilename = "./sohu_%03d.jpg" % (len(queue) + 1)queue.append((url, filename))mc = MCurl(queue, concurr)
mc.add_tasks()
mc.process_tasks()
mc.del_tasks()
mc.dump_process()
運行截圖
4.PURGE等自定義請求實現
#!/usr/bin/env python
#encoding: utf-8import sys, pycurl, cStringIO, urlliburl = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'
cmd = 'PURGE '
#cmd = 'DELETE '
proxy = '127.0.0.1:8080'
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.VERBOSE, 1)
c.setopt(c.PROXY, proxy)
c.setopt(c.CUSTOMREQUEST, cmd)
try:c.perform()
except Exception as e:print e
status = c.getinfo(c.HTTP_CODE)
print "HTTP CODE: ", status
c.close()
運行截圖
說明:
1.使用post表單時,只需要設置
c.setopt(c.POSTFIELDS, postfields) 這個選項會自動將HTTP request method改為POST。源碼pycurl/examples/quickstart/form_post.py 很標準
2.異步批量預取的例子在pycurl/examples/retriever-multi.py,很有代表性的
3.使用自定義方法
c.setopt(pycurl.CUSTOMREQUEST,"DELETE") 詳見官網文章 http://curl.haxx.se/libcurl/c/CURLOPT_CUSTOMREQUEST.html 以及 http://stackoverflow.com/questions/17075844/how-to-send-delete-request-in-pycurl 中對 delete request 的總結
以上是生活随笔為你收集整理的在Ubuntu 14.04 64bit上使用pycURL模块示例的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 在Ubuntu 14.04 64bit上
- 下一篇: 在Ubuntu 14.04 64bit上