MapReduce Basic Development, Part 3: Processing Fields and Writing to a Hive Table
1. MR design and development
   1) Design:
      Input: username | numeric IP | timestamp | URL
      Processing: match the URL against a regular expression; if it matches, parse the URL and convert the numeric IP and the timestamp.
      Output: username|dotted-decimal IP|datetime|URL domain|URL parameters
   2) Development: see the source code in section 3.
      Test file: /tmp/fjs/in/testdata.txt
      Output file: /tmp/fjs/out/part-r-00000.bz2
      Note: two pitfalls to avoid during development: first, handling local files is not the same as handling HDFS files; second, the Map and Reduce functions execute on different nodes, so any shared parameters must be passed in a form visible to both (for example, via the job Configuration). A sample record and a submission command follow.
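      For concreteness, here is a hypothetical input line and the record it should produce (all values are illustrative; this assumes URLocation returns the host as the domain, and the datetime string depends on the local timezone):

      input : user001|3232235777|1441878362|http://www.baidu.com/s?wd=hadoop
      output: user001|192.168.1.1|<yyyyMMddHHmmss datetime>|www.baidu.com|wd=hadoop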
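      Assuming the job is packaged as AdslUrl.jar (the jar name is hypothetical), and noting that the output file above carries a .bz2 extension, output compression must be enabled; a submission command might look like:

      hadoop jar AdslUrl.jar AdslUrl \
        -D mapreduce.output.fileoutputformat.compress=true \
        -D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec \
        /tmp/fjs/in /tmp/fjs/out

      GenericOptionsParser in main() consumes the -D options, leaving only the input and output paths as application arguments.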
2. Hive table: external table tmp_fjs_adslurl, location=/tmp/fjs/out/
   1) DDL script: hive>
      CREATE EXTERNAL TABLE `tmp_fjs_adslurl`(`account` string, `ip` string, `time_stamp` string, `domain` string, `para` string)
      ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
      STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
      OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
      LOCATION 'hdfs://nameservice-ha/tmp/fjs/out';
   2) Query script: hive> select count(*) from tmp_fjs_adslurl;
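      Hive reads the bzip2-compressed text output transparently, so the parsed fields can be spot-checked directly (query illustrative):

      hive> select account, ip, time_stamp, domain, para from tmp_fjs_adslurl limit 10;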
3. Code:
   1) MapReduce code
   The URLocation class extracts the domain and the parameter string from the URL by string slicing; comparable general-purpose approaches are described online, e.g. http://volunteer521.iteye.com/blog/1685942. Because the class used here is third-party code, it cannot be republished without permission; a hypothetical stand-in is sketched after the job code below.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class AdslUrl {

    public static class UrlMapper extends Mapper<Object, Text, Text, Text> {
        private Text oKey = new Text();
        private Text oValue = new Text();
        Util util = new Util();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] iValue = value.toString().split("\\|");   // split the line on '|'
            if (iValue.length < 4) return;                     // guard added: skip malformed lines
            oKey.set(iValue[0].trim());                        // username as output key
            String ip = util.Long2IP(iValue[1].trim());        // numeric IP -> dotted decimal
            String dt = util.timestamp2date(iValue[2].trim()); // timestamp -> datetime
            // Parse the URL: extract the domain and the parameter string
            if (util.regular(iValue[3])) {                     // regex match on the URL
                URLocation parser = new URLocation(" " + iValue[3].trim()); // leading space expected by the parser
                String domain = parser.getPath();              // domain
                String para = parser.getQuery();               // parameter string, '&'-separated
                oValue.set(ip + "|" + dt + "|" + domain + "|" + para);
                context.write(oKey, oValue);
            }
        }
    }

    public static class UrlReducer extends Reducer<Text, Text, Text, Text> {
        private Text oKey = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text val : values) {
                String str = key.toString() + "|" + val.toString();
                oKey.set(str);
                context.write(oKey, new Text("")); // empty value; TextOutputFormat still appends a tab separator
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: AdslUrl <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "parse url");
        job.setJarByClass(AdslUrl.class);
        job.setNumReduceTasks(1); // single reducer so the output is one file, easy to inspect
        job.setMapperClass(UrlMapper.class);
        // No combiner: UrlReducer folds the key into the value, so running it as a
        // combiner would corrupt records (the original setCombinerClass call was a bug).
        job.setReducerClass(UrlReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
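   Since the original URLocation class cannot be republished, the following is a minimal hypothetical stand-in built on java.net.URI. It mirrors only the interface used above (getPath() returning the domain, getQuery() returning the parameter string); it is a sketch, not the original implementation:

import java.net.URI;

// Hypothetical stand-in for the unpublished URLocation class.
public class URLocation {
    private String domain = "";
    private String query = "";

    public URLocation(String url) {
        try {
            URI uri = new URI(url.trim()); // trim the leading space added by the mapper
            if (uri.getHost() != null) domain = uri.getHost();
            if (uri.getQuery() != null) query = uri.getQuery();
        } catch (Exception ex) {
            // leave both fields empty for malformed URLs
        }
    }

    public String getPath()  { return domain; } // domain, matching the usage above
    public String getQuery() { return query; }  // '&'-separated parameter string
}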
   2) Util class: helpers for field conversion
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Util {

    // Convert a numeric IP to dotted-decimal form
    public String Long2IP(String _ipLong) {
        StringBuffer ipInfo = new StringBuffer();
        long[] mask = {0x000000FFL, 0x0000FF00L, 0x00FF0000L, 0xFF000000L};
        long num = 0;
        try {
            long ipLong = Long.parseLong(_ipLong);
            ipInfo.setLength(0);
            for (int i = 0; i < 4; i++) {
                num = (ipLong & mask[i]) >> (i * 8);
                if (i > 0) ipInfo.insert(0, ".");
                ipInfo.insert(0, Long.toString(num, 10)); // build the address from the low byte up
            }
            return ipInfo.toString();
        } catch (Exception ex) {
            return "";
        }
    }

    // Convert a Unix timestamp (seconds) to a datetime string
    public String timestamp2date(String _timeStamp) {
        String dateFormat = "yyyyMMddHHmmss";
        SimpleDateFormat fm = new SimpleDateFormat(dateFormat);
        if (_timeStamp.equals("")) {
            return "";
        }
        try {
            long timeStamp = Long.parseLong(_timeStamp);
            return fm.format(new Date(timeStamp * 1000)); // seconds -> milliseconds
        } catch (Exception ex) {
            return "";
        }
    }

    // Regex filter: keep only URLs of interest
    public boolean regular(String url) {
        String regEx = "baidu|sina|qq";
        Pattern pattern = Pattern.compile(regEx, Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(url);
        return matcher.find();
    }
}
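   A quick local sanity check of the helpers (driver class and values are illustrative):

public class UtilTest {
    public static void main(String[] args) {
        Util util = new Util();
        System.out.println(util.Long2IP("3232235777"));        // 192.168.1.1
        System.out.println(util.timestamp2date("1441878362")); // yyyyMMddHHmmss string in the local timezone
        System.out.println(util.regular("http://www.baidu.com/s?wd=hadoop")); // true
    }
}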