Java---SimHash原理与实现
生活随笔
收集整理的這篇文章主要介紹了
Java---SimHash原理与实现
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
Java—SimHash原理與實現
SimHash 原理
原理鏈接
SimHash 實現
package GetSimilar;import java.io.BufferedReader; import java.io.FileInputStream; import java.io.InputStreamReader; import java.math.BigInteger; import java.util.HashMap; import java.util.Map;import org.ansj.domain.Result; import org.ansj.domain.Term; import org.ansj.recognition.impl.StopRecognition; import org.ansj.splitWord.analysis.ToAnalysis; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.safety.Whitelist;/*** 完成SimHash的計算,將每個詞向量的權重由idf來決定* * @author Ove**/ public class SimHash2 {private int hashbits = 64; // 分詞后的hash數;public Map<String, Double> IDF;public SimHash2(){IDF = getIDF();}public SimHash2(int hashbits) {this.hashbits = hashbits;}// 讀取本地文件,獲得對應詞的idf值public Map<String, Double> getIDF(){Map<String, Double> result = new HashMap<String, Double>();try {// 打開文件FileInputStream fis = new FileInputStream("D:/Document/data/Similar/idf.txt");BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));String line = "";while ((line = br.readLine()) != null) {String[] split = line.split("\t");result.put(split[0], Double.valueOf(split[1]));}// 關閉文件br.close();fis.close();} catch (Exception e) {e.printStackTrace();}return result;}/*** 清除html標簽* * @param content* @return*/private String cleanResume(String content) {// 若輸入為HTML,下面會過濾掉所有的HTML的tagcontent = Jsoup.clean(content, Whitelist.none());content = StringUtils.lowerCase(content);String[] strings = { " ", "\n", "\r", "\t", "\\r", "\\n", "\\t", " " };for (String s : strings) {content = content.replaceAll(s, "");}return content;}/*** 這個是對整個字符串進行hash計算* * @return*/private BigInteger simHash(String tokens) {tokens = cleanResume(tokens); // cleanResume 刪除一些特殊字符int[] v = new int[this.hashbits];Result ansjList = wordAnalyzer(tokens);// System.out.println(ansjList);// 標識該文檔中每個詞出現的次數Map<String, Integer> wordCount = new HashMap<String, Integer>();Integer count = 0;for (Term term : ansjList) {count = wordCount.get(term.getName());if (count == null) {wordCount.put(term.getName(), 1);} else {wordCount.put(term.getName(), count + 1);}}int len = wordCount.size();String word = "";for (Term term : ansjList) {word = term.getName(); // 分詞字符串// 2、將每一個分詞hash為一組固定長度的數列.比如 64bit 的一個整數.BigInteger t = this.hash(word);for (int i = 0; i < this.hashbits; i++) {BigInteger bitmask = new BigInteger("1").shiftLeft(i);// 3、建立一個長度為64的整數數組(假設要生成64位的數字指紋,也可以是其它數字),// 對每一個分詞hash后的數列進行判斷,如果是1000...1,那么數組的第一位和末尾一位加1,// 中間的62位減一,也就是說,逢1加1,逢0減1.一直到把所有的分詞hash數列全部判斷完畢.double tf = (double) wordCount.get(word) / len;if(IDF.get(word)==null) continue;Double weight = 100 * tf * IDF.get(word); // 添加權重,權重應改為出現次數,而不是根據詞性來指定。 // if(weight==null) continue;// if (wordCount.containsKey(word)) {// weight = wordCount.get(word);// }if (t.and(bitmask).signum() != 0) {// 這里是計算整個文檔的所有特征的向量和v[i] += weight;} else {v[i] -= weight;}}}BigInteger fingerprint = new BigInteger("0");for (int i = 0; i < this.hashbits; i++) {if (v[i] >= 0) {fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));}}return fingerprint;}/*** 對單個的分詞進行hash計算;* * @param source* @return*/private BigInteger hash(String source) {if (source == null || source.length() == 0) {return new BigInteger("0");} else {/*** 當sourece 的長度過短,會導致hash算法失效,因此需要對過短的詞補償*/while (source.length() < 3) {source = source + source.charAt(0);}char[] sourceArray = source.toCharArray();BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);BigInteger m = new BigInteger("1000003");BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1"));for (char item : sourceArray) {BigInteger temp = BigInteger.valueOf((long) item);x = x.multiply(m).xor(temp).and(mask);}x = x.xor(new BigInteger(String.valueOf(source.length())));if (x.equals(new BigInteger("-1"))) {x = new BigInteger("-2");}return x;}}/*** 計算海明距離,海明距離越小說明越相似;* * @param one* @param two* @return*/public int hammingDistance(String s1, String s2) {BigInteger one = this.simHash(s1);BigInteger two = this.simHash(s2);BigInteger m = new BigInteger("1").shiftLeft(this.hashbits).subtract(new BigInteger("1"));BigInteger x = one.xor(two).and(m);int tot = 0;while (x.signum() != 0) {tot += 1;x = x.and(x.subtract(new BigInteger("1")));}return tot;}/*** 計算文本相似度* * @param s1* @param s2* @return*/public double getSemblance(String s1, String s2) {double i = (double) this.hammingDistance(s1, s2);return 1 - i / this.hashbits;}/*** 對單行文本進行分詞,分詞后每個詞以空格相隔開* * @param line* @return*/public static Result wordAnalyzer(String line) {StopRecognition filter = new StopRecognition();filter.insertStopNatures("w"); // 過濾標點filter.insertStopNatures("null"); // 過濾空格filter.insertStopNatures("m"); // 過濾數詞,會將 半數 該詞當做是數詞進行過濾String[] stopWords = { "如圖所示", "的", "中", "下列", "說法", "正確", "是", "若", "為", "則", "在" };filter.insertStopWords(stopWords); // 過濾單個詞Result fliterContent = ToAnalysis.parse(line).recognition(filter);return fliterContent;}public static void main(String[] args) {String s1 = "如圖所示,電源電壓保持不變,閉合開關 S 0,滑動變阻器R的滑片向右移動的過程中,下列說法正確的是 A.閉合開關S,若甲、乙均為電壓表,則兩表示數均變小 B.斷開開關S,若甲、乙均為電流表,則兩表示數均變大 C.閉合開關S,若甲、乙均為電壓表,則甲示數不變,乙示數變大 D.斷開開關S,若甲、乙均為電流表,則乙示數不變,甲示數變小";String s2 = "小明今天去吃肯德基";SimHash ob = new SimHash();System.out.println(ob.getSemblance(s1, s2));// System.out.println(new BigInteger("1000003"));}}總結
以上是生活随笔為你收集整理的Java---SimHash原理与实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: php企业网站通讯录管理系统,EML企业
- 下一篇: oracle安装步骤缺少实例,oracl