Storm【实践系列-如何写一个爬虫】 - ParserBolt
生活随笔
收集整理的這篇文章主要介紹了
Storm【实践系列-如何写一个爬虫】 - ParserBolt
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
2019獨角獸企業重金招聘Python工程師標準>>>
閱讀背景: 如果您對爬蟲或者 web 前端不夠了解,請自行 Google。
代碼前提:您需要參閱本ID 所寫的前面兩篇博文: Storm【實踐系列-如何寫一個爬蟲】 - Fetcher
本章主題: ParserBolt 如何完成解析,如何從前面的組件得到數據,并將結果 emit 出去。
博文流程: 博文將整個爬蟲系列公開,其過程為:
    1 : 代碼實現。
    2 : 對代碼的細節進行解析。
    3 : 對整個設計進行回顧,并作總結。
    如果您在參看本ID的博文的過程之中,只存在流程 1,那么請繼續等待。一旦公司業務不處于飽和階段,本ID將保證每日一篇。
package?com.digitalpebble.storm.crawler.bolt.parser;import?java.io.ByteArrayInputStream; import?java.io.IOException; import?java.net.MalformedURLException; import?java.net.URL; import?java.util.ArrayList; import?java.util.HashMap; import?java.util.HashSet; import?java.util.List; import?java.util.Map; import?java.util.Set;import?org.apache.commons.lang.StringUtils; import?org.apache.tika.Tika; import?org.apache.tika.metadata.Metadata; import?org.apache.tika.parser.ParseContext; import?org.apache.tika.sax.BodyContentHandler; import?org.apache.tika.sax.Link; import?org.apache.tika.sax.LinkContentHandler; import?org.apache.tika.sax.TeeContentHandler; import?org.slf4j.LoggerFactory; import?org.xml.sax.ContentHandler;import?backtype.storm.task.OutputCollector; import?backtype.storm.task.TopologyContext; import?backtype.storm.topology.OutputFieldsDeclarer; import?backtype.storm.topology.base.BaseRichBolt; import?backtype.storm.tuple.Fields; import?backtype.storm.tuple.Tuple; import?backtype.storm.tuple.Values;import?com.codahale.metrics.Timer; import?com.digitalpebble.storm.crawler.StormConfiguration; import?com.digitalpebble.storm.crawler.filtering.URLFilters; import?com.digitalpebble.storm.crawler.util.Configuration; import?com.digitalpebble.storm.crawler.util.HistogramMetric; import?com.digitalpebble.storm.crawler.util.MeterMetric; import?com.digitalpebble.storm.crawler.util.TimerMetric; import?com.digitalpebble.storm.crawler.util.URLUtil;/***?Uses?Tika?to?parse?the?output?of?a?fetch?and?extract?text?+?metadata***/@SuppressWarnings("serial") 
public?class?ParserBolt?extends?BaseRichBolt?{private?Configuration?config;private?Tika?tika;private?URLFilters?filters?=?null;private?OutputCollector?collector;private?static?final?org.slf4j.Logger?LOG?=?LoggerFactory.getLogger(ParserBolt.class);private?MeterMetric?eventMeters;private?HistogramMetric?eventHistograms;private?TimerMetric?eventTimers;private?boolean?ignoreOutsideHost?=?false;private?boolean?ignoreOutsideDomain?=?false;public?void?prepare(Map?conf,?TopologyContext?context,OutputCollector?collector)?{config?=?StormConfiguration.create();String?urlconfigfile?=?config.get("urlfilters.config.file","urlfilters.json");if?(urlconfigfile?!=?null)try?{filters?=?new?URLFilters(urlconfigfile);}?catch?(IOException?e)?{LOG.error("Exception?caught?while?loading?the?URLFilters");}ignoreOutsideHost?=?config.getBoolean("parser.ignore.outlinks.outside.host",?false);ignoreOutsideDomain?=?config.getBoolean("parser.ignore.outlinks.outside.domain",?false);//?instanciate?Tikalong?start?=?System.currentTimeMillis();tika?=?new?Tika();long?end?=?System.currentTimeMillis();LOG.debug("Tika?loaded?in?"?+?(end?-?start)?+?"?msec");this.collector?=?collector;this.eventMeters?=?context.registerMetric("parser-meter",new?MeterMetric(),?5);this.eventTimers?=?context.registerMetric("parser-timer",new?TimerMetric(),?5);this.eventHistograms?=?context.registerMetric("parser-histograms",new?HistogramMetric(),?5);}public?void?execute(Tuple?tuple)?{eventMeters.scope("tuple_in").mark();byte[]?content?=?tuple.getBinaryByField("content");eventHistograms.scope("content_bytes").update(content.length);String?url?=?tuple.getStringByField("url");HashMap<String,?String[]>?metadata?=?(HashMap<String,?String[]>)?tuple.getValueByField("metadata");//?TODO?check?status?etc...Timer.Context?timer?=?eventTimers.scope("parsing").time();//?rely?on?mime-type?provided?by?server?or?guess?ByteArrayInputStream?bais?=?new?ByteArrayInputStream(content);Metadata?md?=?new?Metadata();String?text?=?null;LinkContentHandler?l
inkHandler?=?new?LinkContentHandler();ContentHandler?textHandler?=?new?BodyContentHandler();TeeContentHandler?teeHandler?=?new?TeeContentHandler(linkHandler,textHandler);ParseContext?parseContext?=?new?ParseContext();//?parsetry?{tika.getParser().parse(bais,?teeHandler,?md,?parseContext);text?=?textHandler.toString();}?catch?(Exception?e)?{LOG.error("Exception?while?parsing?"?+?url,?e.getMessage());eventMeters.scope("error_content_parsing_"?+?e.getClass().getSimpleName()).mark();collector.fail(tuple);eventMeters.scope("tuple_fail").mark();return;}?finally?{try?{bais.close();}?catch?(IOException?e)?{LOG.error("Exception?while?closing?stream",?e);}}long?duration?=?timer.stop();LOG.info("Parsed?"?+?url?+?"?in?"?+?duration?+?"?msec");//?get?the?outlinks?and?convert?them?to?strings?(for?now)String?fromHost;URL?url_;try?{url_?=?new?URL(url);fromHost?=?url_.getHost().toLowerCase();}?catch?(MalformedURLException?e1)?{//?we?would?have?known?by?now?as?previous//?components?check?whether?the?URL?is?validLOG.error("MalformedURLException?on?"?+?url);eventMeters.scope("error_outlinks_parsing_"?+?e1.getClass().getSimpleName()).mark();collector.fail(tuple);eventMeters.scope("tuple_fail").mark();return;}List<Link>?links?=?linkHandler.getLinks();Set<String>?slinks?=?new?HashSet<String>(links.size());for?(Link?l?:?links)?{if?(StringUtils.isBlank(l.getUri()))continue;String?urlOL?=?null;try?{URL?tmpURL?=?URLUtil.resolveURL(url_,?l.getUri());urlOL?=?tmpURL.toExternalForm();}?catch?(MalformedURLException?e)?{LOG.debug("MalformedURLException?on?"?+?l.getUri());eventMeters.scope("error_out_link_parsing_"+?e.getClass().getSimpleName()).mark();continue;}//?filter?the?urlsif?(filters?!=?null)?{urlOL?=?filters.filter(urlOL);if?(urlOL?==?null)?{eventMeters.scope("outlink_filtered").mark();continue;}}if?(urlOL?!=?null?&&?ignoreOutsideHost)?{String?toHost;try?{toHost?=?new?URL(urlOL).getHost().toLowerCase();}?catch?(MalformedURLException?e)?{toHost?=?null;}if?(toHost?==?null?||?!toHost.equals(fro
mHost))?{urlOL?=?null;?//?skip?iteventMeters.scope("outlink_outsideHost").mark();continue;}}if?(urlOL?!=?null)?{slinks.add(urlOL);eventMeters.scope("outlink_kept").mark();}}//?add?parse?md?to?metadatafor?(String?k?:?md.names())?{//?TODO?handle?mutliple?valuesString[]?values?=?md.getValues(k);metadata.put("parse."?+?k,?values);}collector.emit(tuple,?new?Values(url,?content,?metadata,?text.trim(),?slinks));collector.ack(tuple);eventMeters.scope("tuple_success").mark();}public?void?declareOutputFields(OutputFieldsDeclarer?declarer)?{//?output?of?this?module?is?the?list?of?fields?to?index//?with?at?least?the?URL,?text?contentdeclarer.declare(new?Fields("url",?"content",?"metadata",?"text","outlinks"));}}轉載于:https://my.oschina.net/infiniteSpace/blog/303813
總結
以上是生活随笔為你收集整理的Storm【实践系列-如何写一个爬虫】 - ParserBolt的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 从面试题看技术学习
- 下一篇: vim一些挺方便的功能