大数据项目(一)————生成团购标签
1、項目簡介
所謂團購標籤生成,就是從消費者對商品的評價中提取關鍵詞(標籤),統計每個標籤在該商品評價中出現的累計次數,並按次數降序排序後顯示,類似於淘寶、美團等電商平臺上的「大家印象」功能。
2、業務介紹
1、從復雜的json數據格式中提取出評論標簽項
2、統計每個評論標簽項的數量
3、對統計出的評論標簽按降序排序
4、回顯標簽
3、功能實現
3.1 提取評論標簽項
ReviewTags.java
package cn.ctgu.taggen;import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject;public class ReviewTags {public static String extractTags(String jsonStr){JSONObject object= JSON.parseObject(jsonStr);//將json格式轉成對象if(object==null||!object.containsKey("extInfoList")){return "";}JSONArray array=object.getJSONArray("extInfoList");//提取出key對應的value,并轉成數組,即提取出extInfoList中的兩個valueif(array==null){return "";}StringBuilder sb=new StringBuilder();for(int i=0;i<array.size();i++){JSONObject obj=array.getJSONObject(i);if(obj!=null&&obj.containsKey("title")&&obj.getString("title").equals("contentTags")&&obj.containsKey("values")){JSONArray arr=obj.getJSONArray("values");//提取出values key對應的value并轉成一個數組if(arr==null){continue;}boolean begin=true;for(int j=0;j<arr.size();j++){if(begin){begin=false;}else{sb.append(",");}sb.append(arr.getString(j));}}}return sb.toString();}/** 結果:** 回頭客,上菜快,環境優雅,性價比高,菜品不錯*** 函數功能:從字符串中提取出相應的信息* */public static void main(String[] args) {String s = "{\"reviewPics\":[{\"picId\":2405538806,\"url\":\"http://p0.where.net/shaitu/7c10019c62947d01ded80cc698c77c90217708.jpg\",\"status\":1},{\"picId\":2405442602,\"url\":\"http://p0.meituan.net/shaitu/d41ef06f5d16d5d3cbc871765ff93130270451.jpg\",\"status\":1}],\"extInfoList\":[{\"title\":\"contentTags\",\"values\":[\"回頭客\",\"上菜快\",\"環境優雅\",\"性價比高\",\"菜品不錯\"],\"desc\":\"\",\"defineType\":0},{\"title\":\"tagIds\",\"values\":[\"493\",\"232\",\"24\",\"300\",\"1\"],\"desc\":\"\",\"defineType\":0}],\"expenseList\":null,\"reviewIndexes\":[1,2],\"scoreList\":null}";System.out.println(extractTags(s));System.out.println(extractTags(""));System.out.println(extractTags(null));}}3.2 Java版實現標簽統計排序處理
Tuple2Comparator.java
package cn.ctgu.taggen;

import scala.Tuple2;

import java.util.Comparator;

/**
 * Orders (tag, count) pairs by count in descending order; pairs with the same count
 * are ordered by tag name ascending.
 *
 * <p>The tie-break matters: a sorted collection such as {@code TreeSet} treats two
 * elements that compare as 0 as duplicates and keeps only one of them, so comparing
 * on the count alone would silently drop different tags that share a count.
 *
 * Created by Administrator on 2017/5/12.
 */
public class Tuple2Comparator implements Comparator<Tuple2<String, Integer>> {

    /**
     * @param o1 first (tag, count) pair
     * @param o2 second (tag, count) pair
     * @return a negative value when {@code o1} sorts before {@code o2} (higher count,
     *         or same count and smaller tag name), 0 only when both count and tag are
     *         equal, a positive value otherwise
     */
    @Override
    public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
        // Integer.compare instead of subtraction: no overflow for extreme values.
        int byCountDescending = Integer.compare(o2._2(), o1._2());
        if (byCountDescending != 0) {
            return byCountDescending;
        }
        return o1._1().compareTo(o2._1());
    }
}
// TagGeneratorJava.java
package cn.ctgu.taggen;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

/**
 * Spark job (Java API) that counts the review tags of every shop and prints, per shop,
 * its most frequent tags as {@code shopId==>tag:count,tag:count,...}.
 *
 * <p>Each input line is expected to be {@code <shopId>\t<review JSON>}; lines that do
 * not split into exactly two tab-separated fields are ignored.
 */
public class TagGeneratorJava {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "F:\\徐培成——spark\\線路三\\3-項目-團購網站標簽生成\\團購網站標簽生成\\temptags.txt";

    /** Maximum number of tags reported per shop. */
    private static final int TOP_N = 10;

    /**
     * Runs the job.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        SparkConf conf = new SparkConf();
        conf.setAppName("Gennerator");
        conf.setMaster("local[4]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            JavaRDD<String> rdd1 = sc.textFile(inputPath);

            // Split every line into its tab-separated fields.
            JavaRDD<String[]> rdd2 = rdd1.map(new Function<String, String[]>() {
                public String[] call(String v1) throws Exception {
                    return v1.split("\t");
                }
            });

            // Keep only well-formed lines: exactly (shopId, json).
            JavaRDD<String[]> rdd3 = rdd2.filter(new Function<String[], Boolean>() {
                public Boolean call(String[] v2) throws Exception {
                    return v2.length == 2;
                }
            });

            // shopId -> "tag1,tag2,..."
            JavaPairRDD<String, String> rdd4 = rdd3.mapToPair(new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] v3) throws Exception {
                    return new Tuple2<String, String>(v3[0], ReviewTags.extractTags(v3[1]));
                }
            });

            // Drop reviews that carry no tags.
            JavaPairRDD<String, String> rdd5 = rdd4.filter(new Function<Tuple2<String, String>, Boolean>() {
                public Boolean call(Tuple2<String, String> v5) throws Exception {
                    return v5._2().length() > 0;
                }
            });

            // shopId -> [tag1, tag2, ...]
            JavaPairRDD<String, String[]> rdd6 = rdd5.mapToPair(new PairFunction<Tuple2<String, String>, String, String[]>() {
                public Tuple2<String, String[]> call(Tuple2<String, String> v6) throws Exception {
                    return new Tuple2<String, String[]>(v6._1(), v6._2().split(","));
                }
            });

            // Flatten: shopId -> tag1, shopId -> tag2, ...
            JavaPairRDD<String, String> rdd7 = rdd6.flatMapValues(new Function<String[], Iterable<String>>() {
                public Iterable<String> call(String[] v7) throws Exception {
                    return Arrays.asList(v7);
                }
            });

            // (shopId, tag) -> 1
            JavaPairRDD<Tuple2<String, String>, Integer> rdd8 = rdd7.mapToPair(new PairFunction<Tuple2<String, String>, Tuple2<String, String>, Integer>() {
                public Tuple2<Tuple2<String, String>, Integer> call(Tuple2<String, String> v8) throws Exception {
                    return new Tuple2<Tuple2<String, String>, Integer>(v8, 1);
                }
            });

            // (shopId, tag) -> occurrence count
            JavaPairRDD<Tuple2<String, String>, Integer> rdd9 = rdd8.reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer v1, Integer v2) throws Exception {
                    return v1 + v2;
                }
            });

            // shopId -> (tag, count)
            JavaPairRDD<String, Tuple2<String, Integer>> rdd10 = rdd9.mapToPair(new PairFunction<Tuple2<Tuple2<String, String>, Integer>, String, Tuple2<String, Integer>>() {
                public Tuple2<String, Tuple2<String, Integer>> call(Tuple2<Tuple2<String, String>, Integer> t) throws Exception {
                    return new Tuple2<String, Tuple2<String, Integer>>(t._1()._1(), new Tuple2<String, Integer>(t._1()._2(), t._2()));
                }
            });

            // Wrap every value in a single-element list so the values can be merged per shop.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> rdd11 = rdd10.mapToPair(new PairFunction<Tuple2<String, Tuple2<String, Integer>>, String, List<Tuple2<String, Integer>>>() {
                public Tuple2<String, List<Tuple2<String, Integer>>> call(Tuple2<String, Tuple2<String, Integer>> t) throws Exception {
                    List<Tuple2<String, Integer>> list = new ArrayList<Tuple2<String, Integer>>();
                    list.add(t._2());
                    return new Tuple2<String, List<Tuple2<String, Integer>>>(t._1(), list);
                }
            });

            // shopId -> [(tag, count), (tag, count), ...]
            JavaPairRDD<String, List<Tuple2<String, Integer>>> rdd12 = rdd11.reduceByKey(new Function2<List<Tuple2<String, Integer>>, List<Tuple2<String, Integer>>, List<Tuple2<String, Integer>>>() {
                public List<Tuple2<String, Integer>> call(List<Tuple2<String, Integer>> v1, List<Tuple2<String, Integer>> v2) throws Exception {
                    // Merge into a fresh list instead of mutating an input of the reduce function.
                    List<Tuple2<String, Integer>> merged = new ArrayList<Tuple2<String, Integer>>(v1.size() + v2.size());
                    merged.addAll(v1);
                    merged.addAll(v2);
                    return merged;
                }
            });

            // shopId -> "tag:count,tag:count,..." limited to the TOP_N most frequent tags.
            JavaPairRDD<String, String> rdd13 = rdd12.mapToPair(new PairFunction<Tuple2<String, List<Tuple2<String, Integer>>>, String, String>() {
                public Tuple2<String, String> call(Tuple2<String, List<Tuple2<String, Integer>>> t) throws Exception {
                    return new Tuple2<String, String>(t._1(), formatTopTags(t._2(), TOP_N));
                }
            });

            List<Tuple2<String, String>> data = rdd13.collect();
            for (Tuple2<String, String> tt : data) {
                System.out.println(tt._1() + "==>" + tt._2());
            }
        } finally {
            // Always release the Spark context, also when the job fails.
            sc.stop();
        }
    }

    /**
     * Formats the {@code limit} most frequent tags as {@code tag:count,tag:count,...}.
     *
     * @param tagCounts (tag, count) pairs of one shop
     * @param limit     maximum number of tags to include
     * @return the formatted tags ordered by count descending; empty for an empty list
     */
    private static String formatTopTags(List<Tuple2<String, Integer>> tagCounts, int limit) {
        final Comparator<Tuple2<String, Integer>> byCount = new Tuple2Comparator();
        // A TreeSet drops elements that compare as 0, so ties on the count are broken
        // by tag name here; otherwise different tags with equal counts would be lost.
        TreeSet<Tuple2<String, Integer>> sorted = new TreeSet<Tuple2<String, Integer>>(
                new Comparator<Tuple2<String, Integer>>() {
                    public int compare(Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
                        int c = byCount.compare(a, b);
                        return c != 0 ? c : a._1().compareTo(b._1());
                    }
                });
        sorted.addAll(tagCounts);

        StringBuilder sb = new StringBuilder();
        Iterator<Tuple2<String, Integer>> it = sorted.iterator();
        for (int taken = 0; taken < limit && it.hasNext(); taken++) {
            Tuple2<String, Integer> tag = it.next();
            if (taken > 0) {
                sb.append(',');
            }
            sb.append(tag._1()).append(':').append(tag._2());
        }
        return sb.toString();
    }
}
// 3.3 Scala implementation of tag generation and sorting
import cn.ctgu.taggen.ReviewTags
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark job (Scala API) that counts the review tags per shop and writes, for every
 * shop, its ten most frequent tags as "shopId<TAB>tag:count,tag:count,...".
 */
object TagGenerator {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("TagGenerator by ***")
      .setMaster("local[4]")
    val sc = new SparkContext(sparkConf)

    val rawLines = sc.textFile("F:\\徐培成——spark\\線路三\\3-項目-團購網站標簽生成\\團購網站標簽生成\\temptags.txt")

    val topTagsPerShop = rawLines
      .map(line => line.split("\t"))
      // Keep only well-formed lines: exactly (shopId, json).
      .filter(fields => fields.length == 2)
      // shopId -> "tag1,tag2,tag3"
      .map(fields => (fields(0), ReviewTags.extractTags(fields(1))))
      // Drop reviews without any tag.
      .filter { case (_, tags) => tags.length > 0 }
      // shopId -> Array(tag1, tag2, tag3)
      .map { case (shop, tags) => (shop, tags.split(",")) }
      // Flatten: shopId -> tag1, shopId -> tag2, shopId -> tag3
      .flatMapValues(tags => tags)
      // (shopId, tag) -> 1
      .map { case (shop, tag) => ((shop, tag), 1) }
      // (shopId, tag) -> occurrence count
      .reduceByKey((left, right) => left + right)
      // Tuples cannot be concatenated but lists can: shopId -> List((tag, count))
      .map { case ((shop, tag), count) => (shop, List((tag, count))) }
      // shopId -> List((tag, count), (tag, count), ...)
      .reduceByKey((left, right) => left ::: right)
      // sortBy sorts ascending, so reverse to get descending counts, then keep the
      // first ten and render them as "tag:count,tag:count,...".
      .map { case (shop, counts) =>
        val topTen = counts.sortBy(_._2).reverse.take(10)
        val rendered = topTen.map { case (tag, count) => tag + ":" + count.toString }
        (shop, rendered.mkString(","))
      }

    topTagsPerShop
      .map { case (shop, summary) => shop + "\t" + summary }
      .saveAsTextFile("file:///F:\\comp\\res.txt")
  }
}
// pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.ctgu</groupId>
    <artifactId>JsonLogProcessModel</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <dependencies>
        <!-- JSON parsing of the review payloads. Versions up to 1.2.24 have a known
             autoType remote-code-execution vulnerability; 1.2.83 is the last 1.x
             release and keeps the JSON/JSONObject/JSONArray API used here. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.83</version>
        </dependency>
        <!-- Spark core built for Scala 2.11; provides both the Java and Scala RDD APIs. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
    </dependencies>
</project>
<!-- 4. Summary of techniques and difficulties -->
1、對數據的清洗
2、復雜標簽數據的提取中采用了fastjson技術
3、標籤統計、排序過程中主要用到了較為複雜的Spark RDD算子(transformation操作)
4、該過程包含以下算子的操作:RDD map、filter、mapToPair、flatMapValues、reduceByKey等。
5、解決了數據傾斜問題。
難點:數據比較複雜,清洗過程相對麻煩;RDD排序操作是採用TreeSet集合,實現Comparator接口來達到排序的要求;由於某個熱門商品的評論量過大,導致了數據傾斜問題,解決辦法是給key增加一個隨機數前綴,把同一個熱點key打散到不同的partitions上,先對每個partitions上的數據做一次局部聚合,然後去掉隨機前綴再做一次全局聚合,從而達到緩解數據傾斜的目的。
總結
以上是生活随笔為你收集整理的大数据项目(一)————生成团购标签的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 用JavaScript实现的一些计算公式
- 下一篇: 蓝宝石压力传感器工作原理与优势