A quick note on a Python + pyppeteer crawler for the 努努书坊 (kanunu8.com) book site
As the title says, this was thrown together casually with Python + pyppeteer. It is fairly simple and is recorded here purely as a memo; I won't be answering any questions about it. In outline, the crawler runs in three stages: `get_liebiao` builds a master author list, `get_zuojia` records each author's novel list, and `get_zuopin` downloads the novels themselves.
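Every stage follows the same pyppeteer pattern: launch a local Chromium build, open a page, navigate, take the rendered HTML, and hand it to pyquery for CSS-style selection. A minimal sketch of that pattern (the Chromium path and the index URL are simply the ones the full script below uses):

```python
import asyncio
import pyppeteer
from pyquery import PyQuery as pq

async def fetch(url):
    # Launch a local Chromium build; without executablePath, pyppeteer
    # downloads its own bundled Chromium instead.
    browser = await pyppeteer.launch(
        executablePath="C:\\chrome-win32-chromium-588429\\chrome.exe",
        headless=True, args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(url)
    html = await page.content()  # HTML after JavaScript has run
    await browser.close()
    return pq(html)              # pyquery document, queried with CSS selectors

doc = asyncio.get_event_loop().run_until_complete(
    fetch("https://www.kanunu8.com/author1.html"))
print(doc('title').text())
```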
Setup and helpers. Note that `filter` shadows the Python builtin of the same name, and `appeartimes` is just a hand-rolled `str.count`:

```python
import pyppeteer, asyncio, sys, io, os, re, time, datetime
import openpyxl

# Force UTF-8 console output on Windows.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

from pyquery import PyQuery as pq
from bs4 import BeautifulSoup

executable_path = "C:\\chrome-win32-chromium-588429\\chrome.exe"
noveltype = '华人'

def appeartimes(string, s):
    # Count occurrences of s in string (equivalent to string.count(s)).
    count = 0
    index = 0
    while True:
        index = string.find(s, index)
        if index != -1:
            count += 1
            index += len(s)
        else:
            break
    return count

def complete(s):
    # Zero-pad a number to two digits.
    return "0" + str(s) if s < 10 else str(s)

def now():
    # Timestamp string for log lines.
    d = datetime.datetime.now()
    return str(d.year) + "-" + complete(d.month) + "-" + complete(d.day) + " " + \
           complete(d.hour) + ":" + complete(d.minute) + ":" + complete(d.second)

def filter(s):
    # Strip HTML tags from a fragment; shadows the builtin filter().
    try:
        return re.sub(r'</?\w+[^>]*>', '', s)
    except:
        return s
```

Stage 1: `get_liebiao(url)` opens the author index page, records every index group, author name, and author URL into 任务总列表.xlsx, and creates matching folders on disk:

```python
async def get_liebiao(url):
    # Open the browser.
    browser = await pyppeteer.launch(executablePath=executable_path, headless=True,
                                     userDataDir='D:\\temporary', args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(url)
    content = await page.content()
    _ = pq(content.encode('utf-8').decode('utf-8'))
    liebiao = os.getcwd() + "\\" + _('#main font').eq(0).html()
    print("开始爬取:" + liebiao)
    recorder = os.getcwd() + "\\任务总列表.xlsx"
    if not os.path.exists(recorder):
        wb = openpyxl.Workbook()
        ws = wb.active
        wb.save(recorder)
    wb = openpyxl.load_workbook(recorder)
    ws = wb.active
    if not os.path.exists(liebiao):
        print("创建" + liebiao + "文件夹...")
        os.mkdir(liebiao)
    trslen = len(_("#main tr"))
    row = 1
    for i in range(1, trslen):  # row 0 is presumably the table header
        suoyin = _("#main tr:eq(" + str(i) + ") td:eq(0)").html()
        if not os.path.exists(liebiao + "\\" + suoyin):
            print("在" + liebiao + " 下创建索引文件夹: " + suoyin + " ...")
            os.mkdir(liebiao + "\\" + suoyin)
        all_a = _("#main tr:eq(" + str(i) + ") a")
        for a in all_a.items():
            zuojia = a.html()
            href = 'https://www.kanunu8.com/' + a.attr('href')
            if not os.path.exists(liebiao + "\\" + suoyin + "\\" + zuojia):
                print("在" + suoyin + " 下创建作家文件夹: " + zuojia + " ...")
                os.mkdir(liebiao + "\\" + suoyin + "\\" + zuojia)
            ws.cell(row, 1, suoyin)
            ws.cell(row, 2, zuojia)
            ws.cell(row, 3, href)
            row += 1
    wb.save(recorder)
    # Close the browser.
    await browser.close()
    return 1
```
Stage 2: `get_zuojia()` visits each author URL recorded in stage 1, saves the author's introduction to 简介.txt, and writes that author's novel list (category, title, URL) into a per-author workbook:

```python
async def get_zuojia():
    browser = await pyppeteer.launch(executablePath=executable_path, headless=True,
                                     userDataDir='D:\\temporary', args=['--no-sandbox'])
    page = await browser.newPage()
    recorder = os.getcwd() + "\\任务总列表.xlsx"
    wb = openpyxl.load_workbook(recorder)
    ws = wb.active
    liebiao = os.getcwd() + "\\" + '任务总列表'
    if not os.path.exists(liebiao):
        print("创建 " + liebiao + " 文件夹...")
        os.mkdir(liebiao)
    for row in range(1, ws.max_row + 1):
        url = ws.cell(row, 3).value
        await page.goto(url)
        content = await page.content()
        _ = pq(content.encode('utf-8').decode('utf-8'))
        try:
            jianjie = filter(_("table:eq(8) table:eq(1) p").html())
        except:
            jianjie = ''
        if jianjie is None:
            jianjie = ''
        txt = os.getcwd() + "\\" + noveltype + "作家列表\\" + ws.cell(row, 1).value + \
              "\\" + ws.cell(row, 2).value + "\\简介.txt"
        if not os.path.exists(txt):
            with open(txt, "w", encoding='utf-8') as f:
                f.write(jianjie)
        print(ws.cell(row, 1).value + " " + ws.cell(row, 2).value + " 简介记录完成...", flush=True)
        target = _("table:eq(8) table:eq(1) table tr")
        recorder = liebiao + "\\" + ws.cell(row, 1).value + " " + ws.cell(row, 2).value + ".xlsx"
        if not os.path.exists(recorder):
            wbi = openpyxl.Workbook()
            wsi = wbi.active
            wbi.save(recorder)
        wbi = openpyxl.load_workbook(recorder)
        wsi = wbi.active
        fenlei = ''
        i = 1
        for t in target.items():
            if t.find("a").html() is None:
                # A row without links is a category heading.
                if t.find("strong"):
                    newfenlei = filter(t.find("strong").html().replace("[TXT全文]", "")
                                       .replace("在线阅读", "").replace(" ", "").strip())
                else:
                    try:
                        newfenlei = filter(t.find("span").html().replace("[TXT全文]", "")
                                           .replace("在线阅读", "").replace(" ", "").strip())
                    except:
                        try:
                            newfenlei = filter(t.find("td").html().replace("[TXT全文]", "")
                                               .replace("在线阅读", "").replace(" ", "").strip())
                        except:
                            newfenlei = ""  # Some tr rows are simply empty; nothing to be done about the site's markup.
                if newfenlei != "":
                    fenlei = newfenlei
            else:
                all_a = t.find("a")
                for a in all_a.items():
                    if a.find("img"):
                        continue
                    xiaoshuo = filter(a.html()).replace("《", "").replace("》", "") \
                        .replace("[TXT全文]", "").replace("在线阅读", "").strip()
                    href = "https://www.kanunu8.com" + a.attr('href')
                    wsi.cell(i, 1, fenlei)
                    wsi.cell(i, 2, xiaoshuo)
                    wsi.cell(i, 3, href)
                    i += 1
        wbi.save(recorder)
        print(ws.cell(row, 1).value + " " + ws.cell(row, 2).value + " 小说列表记录完成...", flush=True)
    await browser.close()
    print('完成', flush=True)
```
Stage 3: `get_zuopin()` walks the per-author workbooks and downloads every novel, either as one text file (when the link is a full-text page) or chapter by chapter into per-category subfolders:

```python
async def get_zuopin():
    browser = await pyppeteer.launch(executablePath=executable_path, headless=True,
                                     userDataDir='D:\\temporary', args=['--no-sandbox'])
    page = await browser.newPage()
    page2 = await browser.newPage()
    missions = os.getcwd() + "\\任务总列表"
    paqu = os.getcwd() + "\\" + noveltype + "作家列表"
    if not os.path.exists(paqu):
        print(now() + " " + "创建" + paqu + "文件夹...", flush=True)
        os.mkdir(paqu)

    def fuckfile(f):
        # Skip workbooks whose name contains '0', plus one specific author.
        letter = ['0']
        for l in letter:
            if l in f:
                return False
        if "匪我思存" in f:
            return False
        return True

    print(now() + " " + "任务开始...", flush=True)
    for root, dirs, files in os.walk(missions):
        for file in files:
            if fuckfile(file):
                wb = openpyxl.load_workbook(missions + "\\" + file)
                ws = wb.active
                name = file.split(".")[0]
                print(now() + " " + '开始下载作家:' + " " + name + " 的作品...", flush=True)
                paqupath = paqu + "\\" + name.split(" ")[0]
                if not os.path.exists(paqupath):
                    os.mkdir(paqupath)
                paqupath = paqupath + "\\" + name.split(" ")[1]
                if not os.path.exists(paqupath):
                    os.mkdir(paqupath)
                # if ws.max_row < 2:  # even for an empty sheet, openpyxl reports max_row as 1
                #     continue
                for i in range(1, ws.max_row + 1):
                    print(now() + " " + file + "共" + str(ws.max_row) + "部,开始下载第" + str(i) + "部," +
                          re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", ws.cell(i, 2).value) + "...", flush=True)
                    if ws.cell(i, 1).value == '' or ws.cell(i, 1).value is None:
                        ws.cell(i, 1, '作品')
                    # re.sub strips characters that are illegal in Windows file names.
                    zimulu = paqupath + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", ws.cell(i, 1).value)
                    if not os.path.exists(zimulu):
                        os.mkdir(zimulu)
                    txt = zimulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", ws.cell(i, 2).value) + ".txt"
                    if (not os.path.exists(txt)) and (appeartimes(ws.cell(i, 3).value, 'http') < 2):
                        await page.goto(ws.cell(i, 3).value, {'timeout': 1000 * 60 * 10})
                        content = await page.content()
                        _ = pq(content.encode('utf-8').decode('utf-8'))
                        if _("p").parents('table').html() is not None:
                            # Full-text page: save the whole novel as one file.
                            neirong = BeautifulSoup(_("p").parents('table').html(), "html.parser").text
                            with open(txt, "w", encoding='utf-8') as f:
                                f.write(neirong)
                        else:
                            # Chapter-list page: create a folder and fetch chapter by chapter.
                            zuopinmulu = zimulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", ws.cell(i, 2).value)
                            if not os.path.exists(zuopinmulu):
                                os.mkdir(zuopinmulu)
                            # Try the various layouts the site uses for chapter tables.
                            target = _("table:eq(8) tr:eq(3) table:eq(1) tr")
                            if target.html() is None:
                                target = _("table:eq(8) table:eq(3) table:eq(1) tr")
                            if target.html() is None:
                                target = _(".book dl dd")
                            if target.html() is None:
                                target = _(".book table:eq(1) tr")
                            inner = ''
                            fenlei = '正文'
                            newfenlei = "正文"
                            count = 1
                            for t in target.items():
                                if t.find("a").html() is None:
                                    newfenlei = ""
                                    if t.find("strong"):
                                        newfenlei = filter(t.find("strong").html().replace("[TXT全文]", "")
                                                           .replace("在线阅读", "").replace(" ", "").strip())
                                    elif t.find("span"):
                                        newfenlei = filter(t.find("span").html().replace("[TXT全文]", "")
                                                           .replace("在线阅读", "").replace(" ", "").strip())
                                    elif t.find("td"):
                                        newfenlei = filter(t.find("td").html().replace("[TXT全文]", "")
                                                           .replace("在线阅读", "").replace(" ", "").strip())
                                    else:
                                        newfenlei = "正文"  # Some tr rows are simply empty; nothing to be done about the site's markup.
                                    if newfenlei != "":
                                        fenlei = newfenlei
                                        inner = zuopinmulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", fenlei)
                                        if not os.path.exists(inner):
                                            os.mkdir(inner)
                                        if newfenlei != fenlei:
                                            fenlei = newfenlei
                                        print(now() + " " + "开始下载 " + file + ' ' + ws.cell(i, 1).value + ' ' +
                                              ws.cell(i, 2).value + fenlei + " ...", flush=True)
                                else:
                                    all_a = t.find("a")
                                    for a in all_a.items():
                                        mingzi = re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", filter(a.html().strip()))
                                        innertxt = inner + "\\" + str(count) + " " + mingzi + ".txt"
                                        if not os.path.exists(innertxt):
                                            print(now() + " " + "正在下载 " + file + ' ' + ws.cell(i, 1).value + ' ' +
                                                  ws.cell(i, 2).value + ' ' + fenlei + " " + mingzi + "...", flush=True)
                                            href = ws.cell(i, 3).value
                                            if '.html' in href:
                                                # Trim back to the last '/' to get the chapter's base URL.
                                                while True:
                                                    href = href[:-1]
                                                    if href[len(href) - 1] == '/':
                                                        break
                                            href = href + a.attr("href")
                                            # print(href)
                                            await page2.goto(href, {'timeout': 1000 * 60 * 10})
                                            content2 = await page2.content()
                                            _2 = pq(content2.encode('utf-8').decode('utf-8'))
                                            if _2("p").parents('table').html() is not None:
                                                innerneirong = BeautifulSoup(_2("p").parents('table').html(), "html.parser").text
                                            elif _2("#content").html() is not None:
                                                innerneirong = BeautifulSoup(_2("#content").html(), "html.parser").text
                                            elif _2("#Article").html() is not None:
                                                # Not sure why this can't be fed straight into BeautifulSoup; timing, maybe?
                                                fuck = _2("#Article .text").html()
                                                innerneirong = BeautifulSoup(fuck, "html.parser").text
                                            else:
                                                fuck = _2("body table:eq(4) td:eq(1)").html()
                                                innerneirong = BeautifulSoup(fuck, "html.parser").text
                                            with open(innertxt, "w", encoding='utf-8') as f:
                                                f.write(innerneirong)
                                        count += 1
                    print(now() + " " + file + ' ' + "第" + str(i) + "部下载完毕...", flush=True)
                print(now() + " " + "作家 " + file + " 全部作品下载完毕...", flush=True)
    await browser.close()

url_list = ["https://www.kanunu8.com/author1.html"]
task = (get_zuopin() for url in url_list)  # url is unused; this just yields one coroutine per entry
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*task))
# for content in results:
#     print(1)
```
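As posted, the driver at the bottom only runs `get_zuopin`, so 任务总列表.xlsx and the per-author workbooks must already exist from earlier runs. A minimal sketch (my assumption, not in the original) of chaining all three stages in one run:

```python
# Sketch, assuming the stages are meant to run strictly in order:
# stage 1 builds 任务总列表.xlsx, stage 2 builds the per-author
# workbooks, and stage 3 downloads the novels they record.
async def run_all():
    await get_liebiao("https://www.kanunu8.com/author1.html")
    await get_zuojia()
    await get_zuopin()

asyncio.get_event_loop().run_until_complete(run_all())
```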