python自动生成采集规则_【仅供学习参考】Python多线程池采集小说,超简单!
[Python] 純文本查看 復制代碼
# 采集小說 lingdiankanshu.co
import requests
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool #多線程
import os
# Book title shared between the scraper functions: cljj() assigns it and
# cxsnr() reads it to choose the output folder. A `global` statement at module
# scope has no effect, so only the plain assignment is kept.
xsmz = ''
def cljj(sm):
    """Fetch a book's index page and return the URLs of its chapters.

    Downloads ``https://www.lingdiankanshu.co/<sm>/``, stores the book title
    (text of the ``<h1>`` under the element with ``id="info"``) in the
    module-level ``xsmz`` and collects the ``href`` of every chapter link
    found under the element with ``id="list"``.

    Args:
        sm: Book directory identifier substituted into the index URL.

    Returns:
        A list of chapter URLs, in the order they appear on the page.

    Raises:
        requests.RequestException: The download failed or the server
            answered with an HTTP error status.
        ValueError: The page contains no book title.
    """
    global xsmz
    # Imported locally so the block stays self-contained.
    from urllib.parse import urljoin

    url = "https://www.lingdiankanshu.co/{}/".format(sm)
    resp = requests.get(url, timeout=20)
    # Fail early on 4xx/5xx instead of parsing an error page.
    resp.raise_for_status()
    ljnr = etree.HTML(resp.text)

    # Book title
    xsm = ljnr.xpath('//*[@id="info"]/h1/text()')
    if not xsm:
        raise ValueError("book title not found on {}".format(url))
    xsmz = xsm[0]

    # urljoin resolves page-relative hrefs exactly like the old `url + href`
    # concatenation, and also handles root-relative or absolute hrefs.
    ljj = ljnr.xpath('//*[@id="list"]/dl/dd/a/@href')
    return [urljoin(url, lj) for lj in ljj]
def cxsnr(url):
    """Download one chapter page and save it as a UTF-8 text file.

    The chapter title is taken from the ``<h1>`` under the element with
    ``class="bookname"`` and the body from the text nodes of the element with
    ``id="content"``. The file is written to ``./xs/<book>/<title>.txt``,
    where ``<book>`` is the module-level ``xsmz``.

    Args:
        url: Absolute URL of the chapter page.

    Raises:
        requests.RequestException: The download failed or the server
            answered with an HTTP error status.
        ValueError: The page contains no chapter title.
    """
    resp = requests.get(url, timeout=20)
    resp.raise_for_status()
    xq = etree.HTML(resp.text)

    titles = xq.xpath('//*[@class="bookname"]/h1/text()')
    if not titles:
        raise ValueError("chapter title not found on {}".format(url))
    bt = titles[0]
    print(bt)

    nr = xq.xpath('//*[@id="content"]/text()')
    # str.replace returns a new string; the original code discarded it, so the
    # leading full-width indent was never actually removed.
    xsxq = ''.join(nrxq.replace('\u3000\u3000', '') + '\r\n' for nrxq in nr)

    # Characters that are path separators or are rejected in file names on
    # common file systems are replaced so the title cannot break the path.
    unsafe = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    book_dir = os.path.join('.', 'xs', xsmz.translate(unsafe))
    os.makedirs(book_dir, exist_ok=True)
    path = os.path.join(book_dir, bt.translate(unsafe) + '.txt')

    # newline='' disables newline translation, so the explicit '\r\n' is
    # written as-is instead of becoming '\r\r\n' on Windows.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        f.write(bt + '\r\n' + xsxq)
    print(bt + '.txt---采集成功!')
if __name__ == "__main__":
    # Ask for the book directory id, list its chapters, then download them
    # concurrently with a thread pool.
    sm = input('請輸入書目錄:')
    urls = cljj(sm)
    print(urls)
    # No worker count is passed, so the pool size defaults to the number of
    # CPU cores.
    pool = ThreadPool()
    try:
        pool.map(cxsnr, urls)
    except Exception as exc:
        # Failures here come from downloading or saving a chapter, not from
        # starting threads, so report the actual error. Catching Exception
        # (not a bare except) lets Ctrl-C still interrupt the run.
        print("Error: chapter download failed: {!r}".format(exc))
    finally:
        # Always release the worker threads, even after a failure.
        pool.close()
        pool.join()
總結
以上是生活随笔為你收集整理的python自动生成采集规则_【仅供学习参考】Python多线程池采集小说,超简单!的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 你是外包,麻烦不要偷吃零食,注意素质..
- 下一篇: java 斐波那契编程_Java实现Fi