批量导出某个简书用户的所有文章列表和文章超链接
生活随笔
收集整理的這篇文章主要介紹了
批量导出某个简书用户的所有文章列表和文章超链接
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
簡書改版後,根據文章標題搜索文章的功能就不見了。
雖然簡書提供了批量下載文章的功能,但是下載到本地的文章都是markdown格式的,不包含文章的鏈接,這不滿足我的需求。
既然我是程序員,沒有這個功能我就自己實現一個。
打開簡書首頁,發現默認只顯示8篇文章,用鼠標滑動到屏幕底部後,會觸發一個懶加載事件,到後臺讀取更多的文章列表,可見文章讀取在服務器端是採取分頁方式實現的。
打開Chrome開發者工具,觀察網絡請求,請求url中99b8712e8850是我的簡書用戶id,page=2,3,4這些是分頁的頁碼。
每頁的文章內容以html格式包含在響應結構里:
我關心的只是文章標題和文章鏈接,如上圖高亮字段所示。
最開始我寫了一個nodejs應用,代碼如下:
var request = require('request'); var jsdom = require("jsdom"); var JSDOM = jsdom.JSDOM; const PREFIX = "https://www.jianshu.com"; const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page="; const MAX = 2;var mArticleResult = new Map(); var pageNumber; /* a given article: https://www.jianshu.com/p/963cd23fb092value got from API: /p/5c1d0319dc42 */ var lastPageReached = false; var url = "";var aHandlers = [];// use limited for loop to ease testing for(var i = 0; i < MAX; i++){pageNumber = i + 1;var url = PAGE + pageNumber;// console.log("current page: " + url);var pageOptions = {url: url,method: "GET",headers: {"Accept": "text/html"}};aHandlers.push(getArticles(pageOptions, pageNumber));if( lastPageReached)break; }console.log("promise handler size: " + aHandlers.length);Promise.all(aHandlers).then(function(){var articleIndex = 0;for (var [key, value] of mArticleResult) {console.log("Article[" + articleIndex++ + "]: " + key + " = " + value);}console.log("done"); });function getArticles(pageOptions, pageNumber) {return new Promise(function(resolve,reject){var requestC = request.defaults({jar: true});requestC(pageOptions,function(error,response,body){if( error){console.log("error: " + error);resolve(error);}var document = new JSDOM(body).window.document;var content = document.getElementsByTagName("li");for( var i =0; i < content.length; i++){var li = content[i];var children = li.childNodes;for( var j = 0; j < children.length; j++){var eachChild = children[j];if( eachChild.nodeName == "DIV"){var grandChild = eachChild.childNodes;for( var k = 0; k < grandChild.length; k++){var grand = grandChild[k];if( grand.nodeName == "A"){var fragment = grand.getAttribute("href");if( fragment.indexOf("/p") < 0)continue;console.log("title: " + grand.text);var wholeURL = PREFIX + fragment;console.log("url: " + wholeURL);if( mArticleResult.has(grand.text)){lastPageReached = true;console.log("article size: " + 
mArticleResult.size);resolve(pageNumber);}mArticleResult.set(grand.text, wholeURL);}}}}}// end of outer loopresolve(pageNumber);}); }); }原理就是使用nodejs的request module,向簡書網站同時發起多個請求,每個請求讀取一頁的簡書文章。
後來發現這種方法在並發請求數大於10個的時候就無法工作,簡書網站會拒絕該類請求,返回HTTP 429狀態碼。
所以最後我採用了最簡單的同步請求實現,使用第三方npm模塊sync-request在循環裡逐頁發起請求。
var request = require("sync-request"); var jsdom = require("jsdom"); var JSDOM = jsdom.JSDOM; var textEncoding = require('text-encoding'); var textDecoder = textEncoding.TextDecoder;const PREFIX = "https://www.jianshu.com"; const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page="; const MAX = 100;var mArticleResult = new Map(); var lastPageReached = false; var pageNumber; /* a given article: https://www.jianshu.com/p/963cd23fb092value got from API: /p/5c1d0319dc42 */try {// use limited for loop to ease testingfor (var i = 0; i < MAX; i++) {if( lastPageReached)break;pageNumber = i + 1;var url = PAGE + pageNumber;console.log("current page: " + url);var response = request('GET', url);var html = new textDecoder("utf-8").decode(response.body);handleResponseHTML(html);} } catch (e) {}var articleIndex = 0; var resultHTML = "<html>";const fs = require('fs');/* <HTML> <p> <a href="https://www.baidu.com">eee</a> </p><p><a>22</a></p> <p><a>33</a></p> </HTML> */var index = 1; for (var [key, value] of mArticleResult) {var article = "<p><a href=\"" + key + "\">" + index++ + ". 
" + value + "</a></p>" + "\n";resultHTML = resultHTML + article;console.log("Article[" + articleIndex++ + "]: " + value + " = " + key); }resultHTML = resultHTML + "</html>";var pwd = process.cwd() + "/jianshu.html";fs.appendFileSync(pwd, resultHTML);console.log("done");function handleResponseHTML(html) {var document = new JSDOM(html).window.document;var content = document.getElementsByTagName("li");for (var i = 0; i < content.length; i++) {var li = content[i];var children = li.childNodes;for (var j = 0; j < children.length; j++) {var eachChild = children[j];if (eachChild.nodeName == "DIV") {var grandChild = eachChild.childNodes;for (var k = 0; k < grandChild.length; k++) {var grand = grandChild[k];if (grand.nodeName == "A") {var fragment = grand.getAttribute("href");if (fragment.indexOf("/p") < 0)continue;// console.log("title: " + grand.text);var wholeURL = PREFIX + fragment;// console.log("url: " + wholeURL);if (mArticleResult.has(wholeURL)) {lastPageReached = true;console.log("article size: " + mArticleResult.size);return;}mArticleResult.set(wholeURL, grand.text);}}}}} }這個nodejs應用執行后,會在本地生成一個html文件,包含每篇文章的標題和超鏈接。
要獲取更多Jerry的原創文章,請關注公眾號"汪子熙":
總結
以上是生活随笔為你收集整理的批量导出某个简书用户的所有文章列表和文章超链接的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 台积电 3 月营收约 1454 亿元新台
- 下一篇: Jerry Wang在SAP社区上获得的