python通过url下载文件不可读_python-selenium实现的简易下载器,及常见错误的解决方法
簡易下載器的實現
支持代理、失敗重試,並在返回結果前等待頁面中出現指定 ID 的元素(可根據需求自行修改)
# coding: utf-8
from Utils import logging
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class HtmlDownloader:
    """Simple page downloader built on a Selenium PhantomJS driver.

    Supports an optional HTTP proxy, retries on failure, and waits until an
    element with a given ID is present before returning the page.
    """

    # Seconds to wait for the ``ensureId`` element to appear.
    WAIT_TIMEOUT = 30
    # Number of failed attempts after which ``safeDownload`` gives up.
    MAX_FAIL_TIMES = 5

    def __init__(self):
        self.driver = webdriver.PhantomJS()

    def _restartSession(self, proxy):
        """Open a new driver session whose capabilities carry ``proxy``."""
        # Work on a copy: DesiredCapabilities.PHANTOMJS is a shared
        # module-level dict, and mutating it would leak the proxy settings
        # into every PhantomJS driver created afterwards.
        capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
        proxy.add_to_capabilities(capabilities)
        self.driver.start_session(capabilities)

    def setProxy(self, proxyStr):
        """Start a new session that routes HTTP traffic through ``proxyStr``.

        Args:
            proxyStr: value assigned to the proxy's ``http_proxy`` field.
        """
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        proxy.http_proxy = proxyStr
        self._restartSession(proxy)

    def rmProxy(self):
        """Start a new session that uses a direct (proxy-less) connection."""
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.DIRECT
        # The original called the undefined name ``browser`` here, which
        # raised NameError; the session belongs to ``self.driver``.
        self._restartSession(proxy)

    def download(self, returnType, url, ensureId, proxyStr=None):
        """Fetch ``url`` once and return the page in the requested form.

        Args:
            returnType: "html" for the raw page source, "bs" for a
                BeautifulSoup object parsed with lxml.
            url: address to load.
            ensureId: ID of an element that must be present before the page
                is considered loaded.
            proxyStr: optional proxy; when falsy a direct connection is used.

        Returns:
            The page source or soup, or None when waiting/parsing failed or
            ``returnType`` is not supported.
        """
        if proxyStr:
            self.setProxy(proxyStr)
        else:
            self.rmProxy()
        self.driver.get(url)
        # special for xxx.com
        # your code here

        # ensure for some element
        try:
            WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.ID, ensureId)))
            pageSource = self.driver.page_source
            if returnType == "html":
                downloadResult = pageSource
            elif returnType == "bs":
                downloadResult = bs(pageSource, 'lxml')
            else:
                # Previously this fell through to an UnboundLocalError that
                # was only caught by accident; report it explicitly.
                logging("e", "unsupported returnType: %s" % returnType)
                return None
            logging("i", "download %s bytes" % len(pageSource))
            return downloadResult
        except Exception as e:
            # Top-level boundary for one attempt: log and signal failure with
            # None so that safeDownload can retry.
            logging("e", str(e))
            return None
        finally:
            # Runs on success and failure alike; the next attempt opens a new
            # session through setProxy/rmProxy.
            self.driver.close()

    def safeDownload(self, returnType, url, ensureId, proxyStr=None):
        """Call ``download`` until it yields a truthy result or retries run out.

        Takes the same arguments as ``download``.

        Returns:
            The first truthy download result, or the last (falsy) result
            after ``MAX_FAIL_TIMES`` failed attempts.
        """
        failTimes = 0
        while True:
            downloadResult = self.download(returnType, url, ensureId, proxyStr)
            if downloadResult:
                return downloadResult
            failTimes += 1
            if failTimes >= self.MAX_FAIL_TIMES:
                logging("w", "failed %s times, will abort" % failTimes)
                return downloadResult
            logging("w", "failed %s times, will retry" % failTimes)
元素不可見導致不能操作的錯誤
# ElementNotVisibleException: Message: {"errorMessage":"Element is not currently visible and may not be manipulated"
# Screenshot: available via screen
首先嘗試設定窗口大小
# Set the browser window to 1024x768 before interacting with the element.
self.driver.set_window_size(1024, 768)
不行的話再嘗試滾動頁面,如滾動到底部:
# Scroll to the bottom of the page; uses self.driver like the other snippets.
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
總結
以上是生活随笔為你收集整理的python通过url下载文件不可读_python-selenium实现的简易下载器,并常见错误解决的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: matlab 解方程组_一文读懂MATL
- 下一篇: python123第一周测试作业指导书_