java电子报刊网站_采集电子报纸 - 杨尚川的个人页面 - OSCHINA - 中文开源技术交流社区...
1、接口
/**
 * Newspaper collector: downloads the pages of one newspaper issue.
 *
 * @author 楊尚川
 */
public interface PaperCollector {

    /**
     * Downloads today's newspaper; each file corresponds to one page.
     *
     * @return the downloaded page files
     */
    List<File> collect();

    /**
     * Downloads the newspaper of the given date; each file corresponds to one page.
     *
     * @param date the issue date to download
     * @return the downloaded page files
     */
    List<File> collect(Date date);
}
2、抽象類
/**
*報紙采集器抽象類,通用采集功能實現
*?@author?楊尚川
*/
public?abstract?class?AbstractPaperCollector?implements?PaperCollector{
protected?final?Logger?LOG?=?LoggerFactory.getLogger(getClass());
@Override
public?List?collect()?{
return?collect(new?Date());
}
/**
*?根據下載鏈接提取文件夾名稱
*?@param?href?下載鏈接
*?@return?文件夾名稱
*/
protected?abstract?String?getPath(String?href);
/**
*?根據下載鏈接提取文件名稱
*?@param?href?下載鏈接
*?@return?文件名稱
*/
protected?abstract?String?getFile(String?href);
protected?List?downloadPaper(List?hrefs){
final?List?files?=?new?ArrayList<>();
List?ts?=?new?ArrayList<>();
LOG.info("報紙有"+hrefs.size()+"個版面需要下載:");
for(final?String?href?:?hrefs){
Thread?t?=?new?Thread(new?Runnable(){
@Override
public?void?run()?{
File?file?=?downloadPaper(href);
if(file?!=?null){
files.add(file);
}
}
});
t.start();
ts.add(t);
}
for(Thread?t?:?ts){
try?{
t.join();
}?catch?(InterruptedException?ex)?{
LOG.error("下載報紙出錯:",ex);
}
}
return?files;
}
protected?File?downloadPaper(String?href){
try{
LOG.info("下載報紙:"+href);
String?path?=?getPath(href);
LOG.debug("報紙保存目錄:"+path);
String?file?=?getFile(href);
LOG.debug("報紙保存文件:"+file);
File?dir?=?new?File(path);
if(!dir.exists()){
LOG.debug("創建目錄:"+dir.getAbsolutePath());
dir.mkdirs();
}
File?absoluteFile?=?new?File(path,?file);
LOG.debug("報紙保存絕對路徑:"+absoluteFile.getAbsolutePath());
Tools.copyFile(new?URL(href).openStream(),?absoluteFile);
LOG.info("報紙下載成功:"+href);
LOG.info("報紙成功保存到:"+absoluteFile.getAbsolutePath());
return?absoluteFile;
}catch(IOException?e){
LOG.error("報紙下載失敗:"+e);
}
return?null;
}
protected?void?run()?{
//今天
List?files?=?collect();
int?i?=?1;
for(File?file?:?files){
LOG.info((i++)+"?:?"?+?file.getAbsolutePath());
}
//昨天
Date?date?=?new?Date();
date.setTime(System.currentTimeMillis()-24*3600*1000);
files?=?collect(date);
i?=?1;
for(File?file?:?files){
LOG.info((i++)+"?:?"?+?file.getAbsolutePath());
}
//前天
date?=?new?Date();
date.setTime(System.currentTimeMillis()-2*24*3600*1000);
files?=?collect(date);
i?=?1;
for(File?file?:?files){
LOG.info((i++)+"?:?"?+?file.getAbsolutePath());
}
}
}
3、采集新華日報
/**
*?新華日報
*?@author?楊尚川
*/
public?class?XHRBPaperCollector?extends?AbstractPaperCollector{
private?static?final?String?paperName?=?"新華日報";
private?static?final?String?paperPath?=?"http://xh.xhby.net/newxh/";
private?static?final?String?url?=?paperPath+"html/";
private?static?final?String?hrefPrefix?=?paperPath+"page/1/";
private?static?final?String?start?=?"node_2.htm";
private?static?final?String?pdfCssQuery?=?"html?body?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?div?table?tbody?tr?td?a";
private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");
@Override
public?List?collect(Date?date)?{
List?hrefs?=?new?ArrayList<>();
try?{
LOG.debug("url:?"+url);
String?paper?=?url?+?sf.format(date)?+?start;
LOG.debug("paper:?"+paper);
Document?document?=?Jsoup.connect(paper).get();
LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);
Elements?elements?=?document.select(pdfCssQuery);
for(Element?element?:?elements){
String?href?=?element.attr("href");
if(href?!=?null?&&?href.endsWith(".pdf")){
LOG.debug("報紙鏈接:"+href);
href?=?href.replace("../../../",?"");
LOG.debug("報紙鏈接:"+href);
hrefs.add(paperPath+href);
}else{
LOG.debug("不是報紙鏈接:"+href);
}
}
}?catch?(IOException?ex)?{
LOG.error("采集出錯",ex);
}
return?downloadPaper(hrefs);
}
@Override
protected?String?getPath(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
attrs?=?attrs[0].split("-");
StringBuilder?str?=?new?StringBuilder();
str.append(paperName)
.append(File.separator)
.append(attrs[0])
.append("-")
.append(attrs[1])
.append(File.separator)
.append(attrs[2]);
return?str.toString();
}
@Override
protected?String?getFile(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
String?file?=?attrs[1]+".pdf";
return?file;
}
public?static?void?main(String[]?args)?{
new?XHRBPaperCollector().run();
}
}
4、采集楚天都市報
/**
*?楚天都市報
*?@author?楊尚川
*/
public?class?CTDSBPaperCollector?extends?AbstractPaperCollector{
private?static?final?String?paperName?=?"楚天都市報";
private?static?final?String?host?=?"http://ctdsb.cnhubei.com/";
private?static?final?String?paperPath?=?host+"ctdsb/";
private?static?final?String?url?=?host+"html/ctdsb/";
private?static?final?String?hrefPrefix?=?paperPath;
private?static?final?String?start?=?"index.html";
private?static?final?String?pdfCssQuery?=?"html?body?center?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?div?table?tbody?tr?td.info3?a";
private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyyMMdd/");
@Override
public?List?collect(Date?date)?{
List?hrefs?=?new?ArrayList<>();
try?{
LOG.debug("url:?"+url);
String?paper?=?url?+?sf.format(date)?+?start;
LOG.debug("paper:?"+paper);
Document?document?=?Jsoup.connect(paper).get();
LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);
Elements?elements?=?document.select(pdfCssQuery);
int?count=0;
for(Element?element?:?elements){
String?text?=?element.text();
if(text?!=?null?&&?text.startsWith("第")){
LOG.debug("報紙文本:"+text);
count++;
}else{
LOG.debug("不是報紙文本:"+text);
}
}
//有的版面缺失,而文件名是順序遞增的
for(int?i=1;?i<=count;?i++){
String?seq?=?Integer.toString(i);
if(i<10){
seq="0"+seq;
}
hrefs.add(paperPath?+?sf.format(date)?+?"page_"+seq+".jpg");
}
}?catch?(IOException?ex)?{
LOG.error("采集出錯",ex);
}
return?downloadPaper(hrefs);
}
@Override
protected?String?getPath(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
StringBuilder?str?=?new?StringBuilder();
str.append(paperName)
.append(File.separator)
.append(attrs[0].substring(0,?4))
.append("-")
.append(attrs[0].substring(4,?6))
.append(File.separator)
.append(attrs[0].substring(6,?8));
return?str.toString();
}
@Override
protected?String?getFile(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
String?file?=?attrs[1].split("_")[1];
return?file;
}
public?static?void?main(String[]?args)?{
new?CTDSBPaperCollector().run();
}
}
5、采集京九晚報
/**
*?京九晚報
*?@author?楊尚川
*/
public?class?JJWBPaperCollector?extends?AbstractPaperCollector{
private?static?final?String?paperName?=?"京九晚報";
private?static?final?String?paperPath?=?"http://epaper.cnsq.com.cn/jjwb/";
private?static?final?String?url?=?paperPath+"html/";
private?static?final?String?hrefPrefix?=?paperPath+"page/10/";
private?static?final?String?start?=?"node_11.htm";
private?static?final?String?pdfCssQuery?=?"html?body?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?div?table?tbody?tr?td?a";
private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");
@Override
public?List?collect(Date?date)?{
List?hrefs?=?new?ArrayList<>();
try?{
LOG.debug("url:?"+url);
String?paper?=?url?+?sf.format(date)?+?start;
LOG.debug("paper:?"+paper);
Document?document?=?Jsoup.connect(paper).get();
LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);
Elements?elements?=?document.select(pdfCssQuery);
for(Element?element?:?elements){
String?href?=?element.attr("href");
if(href?!=?null?&&?href.endsWith(".pdf")){
LOG.debug("報紙鏈接:"+href);
href?=?href.replace("../../../",?"");
LOG.debug("報紙鏈接:"+href);
hrefs.add(paperPath+href);
}else{
LOG.debug("不是報紙鏈接:"+href);
}
}
}?catch?(IOException?ex)?{
LOG.error("采集出錯",ex);
}
return?downloadPaper(hrefs);
}
@Override
protected?String?getPath(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
StringBuilder?str?=?new?StringBuilder();
str.append(paperName)
.append(File.separator)
.append(attrs[0])
.append(File.separator)
.append(attrs[1]);
return?str.toString();
}
@Override
protected?String?getFile(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
String?file?=?attrs[2]+".pdf";
return?file;
}
public?static?void?main(String[]?args)?{
new?JJWBPaperCollector().run();
}
}
6、采集信息時報
/**
*?信息時報
*?@author?楊尚川
*/
public?class?XXSBPaperCollector?extends?AbstractPaperCollector{
private?static?final?String?paperName?=?"信息時報";
private?static?final?String?host?=?"http://informationtimes.dayoo.com/";
private?static?final?String?paperPath?=?host+"page/1019/";
private?static?final?String?url?=?host+"html/";
private?static?final?String?hrefPrefix?=?paperPath;
private?static?final?String?start?=?"node_1019.htm";
private?static?final?String?pdfCssQuery?=?"html?body#content?div.container?div.leftcolumn?div.leftcolumncontent?div.pagebuttontwo?div.con?p.right?span.dfive?a";
private?static?final?String?subCssQuery?=?"html?body#listcontent?div.container?div.rightcolumn?div.subcbga?div.listcontent?div#all_article_list.list?h4?span.left?a";
private?static?final?String?contentCssQuery?=?"html?body?div.container?div.leftcolumn?div.tbga?div.bbga?div.cbga?div.left?div.pagepicture?div?map?area";
private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");
@Override
public?List?collect(Date?date)?{
List?hrefs?=?new?ArrayList<>();
try?{
LOG.debug("url:?"+url);
String?paper?=?url?+?sf.format(date)?+?start;
LOG.debug("paper:?"+paper);
Document?document?=?Jsoup.connect(paper).get();
//1、找到子報紙
LOG.debug("subCssQuery:?"?+?subCssQuery);
Elements?elements?=?document.select(subCssQuery);
for(Element?element?:?elements){
String?text?=?element.text();
String?href?=?element.attr("href");
if(text?!=?null?&&?text.contains(":")?&&?href?!=?null?&&?href.endsWith(".htm")){
String?subPaperURL?=?url?+?sf.format(date)?+?href;
LOG.debug("子報紙文本:"+text+"?,?"+href);
LOG.debug("subPaperURL:"+subPaperURL);
//2、找到內容頁面
LOG.debug("contentCssQuery:?"?+?contentCssQuery);
Elements?contentElements?=?Jsoup.connect(subPaperURL).get().select(contentCssQuery);
for(Element?contentElement?:?contentElements){
String?h?=?contentElement.attr("href");
if(h?!=?null?&&?h.startsWith("content_")?&&?h.endsWith(".htm")){
String?contentURL?=?url?+?sf.format(date)?+?h;
LOG.debug("contentURL:"+contentURL);
//3、找PDF
LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);
Elements?pdfElements?=?Jsoup.connect(contentURL).get().select(pdfCssQuery);
for(Element?pdfElement?:?pdfElements){
String?pdf?=?pdfElement.attr("href");
if(pdf?!=?null?&&?pdf.endsWith(".pdf")){
LOG.debug("報紙鏈接:"+pdf);
pdf?=?pdf.replace("../../../",?"");
LOG.debug("報紙鏈接:"+pdf);
hrefs.add(host+pdf);
}else{
LOG.debug("不是報紙鏈接:"+pdf);
}
}
//有多個content,選擇一個即可
break;
}
}
}else{
LOG.debug("不是子報紙文本:"+text+"?,?"+href);
}
}
}?catch?(IOException?ex)?{
LOG.error("采集出錯",ex);
}
return?downloadPaper(hrefs);
}
@Override
protected?String?getPath(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
StringBuilder?str?=?new?StringBuilder();
str.append(paperName)
.append(File.separator)
.append(attrs[0])
.append(File.separator)
.append(attrs[1]);
return?str.toString();
}
@Override
protected?String?getFile(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
String?file?=?attrs[2]+".pdf";
return?file;
}
public?static?void?main(String[]?args)?{
new?XXSBPaperCollector().run();
}
}
7、采集羊城晚報
/**
*?羊城晚報
*?@author?楊尚川
*/
public?class?YCWBPaperCollector?extends?AbstractPaperCollector{
private?static?final?String?paperName?=?"羊城晚報";
private?static?final?String?paperPath?=?"http://www.ycwb.com/ePaper/ycwb/";
private?static?final?String?url?=?paperPath+"html/";
private?static?final?String?hrefPrefix?=?paperPath+"images/";
private?static?final?String?start?=?"node_2081.htm";
private?static?final?String?pdfCssQuery?=?"html?body?div.cbody?div.areaL?div.box?div.conBox2?div?div.xx?h2?em?a.px12";
private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");
@Override
public?List?collect(Date?date)?{
List?hrefs?=?new?ArrayList<>();
try?{
LOG.debug("url:?"+url);
String?paper?=?url?+?sf.format(date)?+?start;
LOG.debug("paper:?"+paper);
Document?document?=?Jsoup.connect(paper).get();
LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);
Elements?elements?=?document.select(pdfCssQuery);
for(Element?element?:?elements){
String?href?=?element.attr("href");
if(href?!=?null?&&?href.endsWith(".pdf")){
LOG.debug("報紙鏈接:"+href);
href?=?href.replace("../../../",?"");
LOG.debug("報紙鏈接:"+href);
hrefs.add(paperPath+href);
}else{
LOG.debug("不是報紙鏈接:"+href);
}
}
}?catch?(IOException?ex)?{
LOG.error("采集出錯",ex);
}
return?downloadPaper(hrefs);
}
@Override
protected?String?getPath(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
StringBuilder?str?=?new?StringBuilder();
str.append(paperName)
.append(File.separator)
.append(attrs[0])
.append(File.separator)
.append(attrs[1]);
return?str.toString();
}
@Override
protected?String?getFile(String?href)?{
String?path?=?href.replace(hrefPrefix,?"");
String[]?attrs?=?path.split("/");
String?file?=?attrs[2]+".pdf";
return?file;
}
public?static?void?main(String[]?args)?{
new?YCWBPaperCollector().run();
}
}
總結
以上是生活随笔為你收集整理的java电子报刊网站_采集电子报纸 - 杨尚川的个人页面 - OSCHINA - 中文开源技术交流社区...的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Android :约束布局Constra
- 下一篇: e^(πi)=-1的最佳解释笔记