Nutch爬虫解决页面相对路径问题
2019獨角獸企業重金招聘Python工程師標準>>>
修改 LinkDb.java 的 map 方法,將頁面中取到的 outlink(可能是相對路徑)以來源頁面的 URL 為基準解析成絕對 URL:
import com.sun.org.apache.xml.internal.utils.URI.MalformedURIException;
import com.sun.org.apache.xml.internal.utils.URI;
??? Inlinks inlinks = new Inlinks();
??? URI baseUri = new URI(fromUrl);
??? URI absoluteUri = null;
??? for (int i = 0; i < outlinks.length; i++) {
????? Outlink outlink = outlinks[i];
????? String toUrl = outlink.getToUrl();
????? if (ignoreInternalLinks) {
??????? String toHost = getHost(toUrl);
??????? if (toHost == null || toHost.equals(fromHost)) { // internal link
????????? continue;?????????????????????????????? // skip it
??????? }
????? }
????? if (urlNormalizers != null) {
??????? try {
????????? toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
??????? } catch (Exception e) {
????????? LOG.warn("Skipping " + toUrl + ":" + e);
????????? toUrl = null;
??????? }
????? }
????? if (toUrl != null && urlFilters != null) {
??????? try {
????????? toUrl = urlFilters.filter(toUrl); // filter the url
??????? } catch (Exception e) {
????????? LOG.warn("Skipping " + toUrl + ":" + e);
????????? toUrl = null;
??????? }
????? }
????? if (toUrl == null) continue;
????? inlinks.clear();
????? String anchor = outlink.getAnchor();??????? // truncate long anchors
????? if (anchor.length() > maxAnchorLength) {
??????? anchor = anchor.substring(0, maxAnchorLength);
????? }
????? inlinks.add(new Inlink(fromUrl, anchor));?? // collect inverted link
????? try {
?? ??? ?? absoluteUri = new URI(baseUri, toUrl);
????? } catch (MalformedURIException e) {
?? ??? ?? continue;
????? }
? ??
//????? output.collect(new Text(toUrl), inlinks);
?? ?? output.collect(new Text(absoluteUri.toString()), inlinks);
??? }
還有一種方法:直接使用 java.net.URI 的 resolve 方法,將相對路徑解析為絕對 URL:
import java.net.*; ?
import java.io.*; ?
/**
 * Demonstrates how to turn a relative link found on a web page into an
 * absolute URL by resolving it against the URL of the page itself.
 */
public class Test {

  /** URL of the page on which the relative link was found. */
  private static final String BASE_PAGE_URL =
      "http://www.pep.com.cn/xe/jszx/tbjxzy/pepxe/pepsa/dzkb/200703/t20070308_303223.htm";

  /** Relative link, as it appears in the page above. */
  private static final String RELATIVE_LINK =
      "../../../pepwa/dzkb/200703/W020070308571116931595.jpg";

  /**
   * Resolves a link against the URL of the page that contains it.
   *
   * @param baseUrl absolute URL of the containing page
   * @param relativeUrl link to resolve; an already-absolute link is returned unchanged
   * @return the absolute URL, in the string form produced by {@link URL#toString()}
   * @throws IllegalArgumentException if either argument is not a valid URI, or the
   *     resolved URI cannot be converted to a URL
   */
  public static String resolve(String baseUrl, String relativeUrl) {
    try {
      URI base = new URI(baseUrl);
      // URI.resolve collapses "../" segments against the base path.
      URI absolute = base.resolve(relativeUrl);
      return absolute.toURL().toString();
    } catch (URISyntaxException | MalformedURLException e) {
      throw new IllegalArgumentException(
          "Cannot resolve '" + relativeUrl + "' against '" + baseUrl + "'", e);
    }
  }

  /** Prints the absolute URL of the sample relative link. */
  public static void main(String args[]) throws Exception {
    System.out.println(resolve(BASE_PAGE_URL, RELATIVE_LINK));
  }
}
轉載于:https://my.oschina.net/junfrank/blog/286894
總結
以上是生活随笔為你收集整理的Nutch爬虫解决页面相对路径问题的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: oracle之报错:ORA-00054:
- 下一篇: WindDbug应用