一段基于Jsoup和Dom4j的海报爬取小程序
生活随笔
收集整理的這篇文章主要介紹了
《一段基于Jsoup和Dom4j的海报爬取小程序》,
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
2019獨角獸企業重金招聘Python工程師標準>>>
package com.pan.tools;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.ResourceBundle;

import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;

/**
 * Scrapes the movie poster list from the page configured under the
 * {@code douban} key and writes it out as an XML file.
 *
 * <p>All configuration is read from the {@code xmlCN} resource bundle:
 * {@code douban} (page URL), {@code xmlPath} (output directory) and
 * {@code fileName} (output file name without the {@code .xml} extension).
 *
 * @author Javay, 2012-9-7 3:13:10 PM
 */
public class MovieRssCNGenerator {

    /** Maximum number of attempts made to fetch the page. */
    private static final int RETRY_TIME = 3;

    /** Pause between two fetch attempts, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 3000L;

    /** Timeout passed to the Jsoup connection, in milliseconds. */
    private static final int REQUEST_TIMEOUT_MILLIS = 20000;

    private final ResourceBundle bundle = ResourceBundle.getBundle("xmlCN");

    /**
     * Formats the current time as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @return the formatted current date and time
     */
    public static String getDateTime() {
        // A fresh formatter per call: SimpleDateFormat is not thread-safe.
        return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
    }

    /**
     * Fetches the configured page and extracts the poster items
     * ({@code li.poster} inside the first {@code div.screening-bd}).
     *
     * <p>The request is attempted up to {@link #RETRY_TIME} times when an
     * {@link IOException} occurs, sleeping between attempts.
     *
     * @return the poster elements, or {@code null} when every attempt failed,
     *         the expected container is not present in the page, or the
     *         thread was interrupted while waiting to retry
     */
    public Elements getDoubanMovieSlidePic() {
        for (int attempt = 1; attempt <= RETRY_TIME; attempt++) {
            try {
                // NOTE(review): the form data, user agent and cookie values look like
                // placeholders copied from an example - confirm they are intended.
                org.jsoup.nodes.Document page = Jsoup.connect(bundle.getString("douban"))
                        .data("query", "Java")
                        .userAgent("Mozilla")
                        .cookie("auth", "token")
                        .timeout(REQUEST_TIMEOUT_MILLIS)
                        .post();
                org.jsoup.nodes.Element screening = page.select("div.screening-bd").first();
                if (screening == null) {
                    // first() yields null for an empty selection; report "no data"
                    // to the caller instead of failing with a NullPointerException.
                    return null;
                }
                return screening.select("li.poster");
            } catch (IOException e) {
                if (attempt < RETRY_TIME) {
                    System.out.println("請求超時,進行第" + attempt + "次重連。");
                    try {
                        Thread.sleep(RETRY_DELAY_MILLIS);
                    } catch (InterruptedException interrupted) {
                        // Keep the interrupt visible to the caller and stop retrying.
                        Thread.currentThread().interrupt();
                        return null;
                    }
                }
            }
        }
        return null;
    }

    /**
     * Builds the XML document for the given poster items and writes it to
     * {@code <xmlPath>/<fileName>.xml}.
     *
     * @param items poster elements as returned by
     *              {@link #getDoubanMovieSlidePic()}; when {@code null} a
     *              failure message is printed and nothing is written
     */
    public void createXMLDoc(Elements items) {
        if (items == null) {
            System.out.println("數據讀取失敗!程序終止!");
            return;
        }
        writeDocument(buildDocument(items));
    }

    /** Converts the poster items into a {@code <movies>} document with one {@code <movie>} per item. */
    private Document buildDocument(Elements items) {
        Document doc = DocumentHelper.createDocument();
        doc.addComment("panmay.com" + getDateTime());
        Element root = doc.addElement("movies");
        for (org.jsoup.nodes.Element item : items) {
            Elements image = item.select("img");
            Element movie = root.addElement("movie");
            movie.addElement("title").setText(image.attr("alt").trim());
            movie.addElement("link").setText(item.select("a").attr("href"));
            // Use "data-original" when the attribute is non-empty, otherwise fall back to "src".
            String lazySource = image.attr("data-original");
            String picture = "".equals(lazySource) ? image.attr("src") : lazySource;
            movie.addElement("pic").setText(picture.trim());
        }
        return doc;
    }

    /** Writes the document as pretty-printed UTF-8 XML into the configured directory. */
    private void writeDocument(Document doc) {
        String fileName = bundle.getString("fileName");
        File directory = new File(bundle.getString("xmlPath"));
        if (!directory.exists()) {
            System.out.println("目錄不存在,創建一個新的文件輸出路徑: " + directory);
            if (!directory.mkdirs() && !directory.isDirectory()) {
                System.err.println("無法創建輸出目錄: " + directory);
                return;
            }
        }
        // Resolve against the directory so a missing trailing separator in
        // xmlPath cannot place the file outside of it.
        File target = new File(directory, fileName + ".xml");

        OutputFormat format = OutputFormat.createPrettyPrint();
        format.setEncoding("UTF-8");

        FileOutputStream stream = null;
        XMLWriter writer = null;
        boolean written = false;
        try {
            stream = new FileOutputStream(target);
            writer = new XMLWriter(stream, format);
            writer.write(doc);
            written = true;
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                } else if (stream != null) {
                    // The writer was never created, so the stream must be closed directly.
                    stream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
                written = false;
            }
        }
        if (written) {
            System.out.println(fileName + "文件輸出完畢!");
        }
    }

    /**
     * Fetches the poster list and writes the XML file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        MovieRssCNGenerator robot = new MovieRssCNGenerator();
        robot.createXMLDoc(robot.getDoubanMovieSlidePic());
    }
}
// Reposted from: https://my.oschina.net/panjavay/blog/77416
總結
以上是生活随笔為你收集整理的一段基于Jsoup和Dom4j的海报爬取小程序的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 向量几何在游戏编程中的使用系列二之2-D
- 下一篇: 神探tcpdump第三招