當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

使用Nodejs实现的小说爬虫

發布時間：2024/8/26 编程问答 28 豆豆

生活随笔收集整理的這篇文章主要介紹了使用Nodejs实现的小说爬虫，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

// Load dependencies
const http = require('http')
const fs = require('fs')
const cheerio = require('cheerio')
const iconv = require('iconv-lite')

// URL of the first chapter
const url = 'http://www.81zw.com/book/8634/745331.html'
// Number of the chapter currently being fetched (starts at 1)
let i = 1
// Maximum number of chapters to fetch
const num = 100

/**
 * Entry point of the crawler: starts fetching at the given chapter page.
 *
 * @param {string} url - Address of the chapter page to fetch.
 */
function main(url) {
  startRequest(url)
}

/**
 * Resolves the "next chapter" href against the page it was found on.
 *
 * @param {string|undefined} href - Raw href attribute of the next-chapter link.
 * @param {string} base - URL of the page that contained the link.
 * @returns {string|null} Absolute http URL, or null when there is no usable link.
 */
function resolveNextLink(href, base) {
  if (!href) {
    return null
  }
  try {
    const next = new URL(href, base)
    // http.get only accepts the http: protocol, so anything else ends the crawl.
    return next.protocol === 'http:' ? next.href : null
  } catch (err) {
    // An unparsable href is treated the same as a missing link.
    return null
  }
}

/**
 * Downloads one chapter page, extracts book name, title and paragraphs,
 * saves them through saveContent, and schedules the next chapter 100 ms
 * later until `num` chapters have been fetched or no next link is found.
 *
 * @param {string} url - Address of the chapter page to fetch.
 */
function startRequest(url) {
  http.get(url, res => {
    if (res.statusCode !== 200) {
      // Drain the response so the socket is released, then stop crawling.
      res.resume()
      console.error(`Request for ${url} failed with status ${res.statusCode}`)
      return
    }

    // Raw Buffer chunks of the response body
    const chunks = []
    res.on('data', chunk => {
      chunks.push(chunk)
    })
    res.on('end', () => {
      // The body is decoded as GBK with iconv-lite; Buffer.concat joins the chunks.
      const html = iconv.decode(Buffer.concat(chunks), 'gbk')
      // cheerio exposes a jQuery-like API over the parsed document.
      const $ = cheerio.load(html, {decodeEntities: false})

      const content = $('#content').html()
      if (content == null) {
        // Without a #content element there is nothing to split or save.
        console.error(`No #content element found at ${url}`)
        return
      }

      const title = $('.bookname h1').text()
      const bookName = $('.con_top a').eq(2).text()
      // Paragraphs are separated by a double <br> in the page markup.
      const paragraphs = content.split('<br><br>').map(part => trim(part))

      // Record that gets written to disk for this chapter
      const obj = {
        id: i,
        err: 0,
        bookName: bookName,
        title: title,
        content: paragraphs
      }

      // The third link in .bottem2 is used as the next-chapter address.
      const link = $('.bottem2 a').eq(2).attr('href')
      const nextLink = resolveNextLink(link, url)
      saveContent(obj, nextLink)

      if (nextLink === null) {
        console.error(`No next chapter link found at ${url}`)
        return
      }

      console.log(`第${i + 1}章：${nextLink}`)
      i++
      if (i <= num) {
        setTimeout(() => {
          main(nextLink)
        }, 100)
      }
    })
  }).on('error', err => {
    // Connection-level failures (DNS, refused, reset) end up here.
    console.error(`Request for ${url} failed: ${err.message}`)
  })
}

/**
 * Writes one chapter record to ./data/<bookName>/chapter<id>.json.
 *
 * @param {{id: number, err: number, bookName: string, title: string, content: string[]}} obj
 *   Chapter record to serialise.
 * @param {string|null} nextLink - Address of the following chapter (currently unused).
 */
function saveContent(obj, nextLink) {
  console.log(`${obj.id}--${obj.title}`)
  const dir = `./data/${obj.bookName}`
  // recursive: true also creates ./data and does not fail if the folder exists.
  fs.mkdirSync(dir, {recursive: true})
  // Write the record as a JSON file
  fs.writeFile(`${dir}/chapter${obj.id}.json`, JSON.stringify(obj), 'utf-8', err => {
    if (err) throw err
  })
}

/**
 * Removes every space character, non-breaking space (U+00A0) and literal
 * `&nbsp;` entity from the string, then strips leading/trailing whitespace.
 *
 * @param {string} str - Raw HTML fragment of one paragraph.
 * @returns {string} The cleaned paragraph text.
 */
function trim(str) {
  return str.replace(/&nbsp;|[ \u00a0]/g, '').trim()
}

main(url)

生成文件

轉載于:https://www.cnblogs.com/tgxh/p/6754649.html

總結

以上是生活随笔為你收集整理的使用Nodejs实现的小说爬虫的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：【BZOJ 2721】 2721: [V
下一篇： form插件的核心方法--ajaxFor