Lucene笔记一
Lucene是一個全文檢索工具庫,用來為數據建立索引并進行搜索;索引的作用類似于新華字典的目錄。
這里使用的是lucene-4.4.0版本,入門代碼所需jar包如下圖所示(解壓lucene-4.4.0后的目錄):
入門代碼:
import java.io.File; import java.io.IOException;import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test;/*8* luceneDemo* */ public class TestLucene {/*** 通過lucene 提供的api 對數據建立索引,indexWriter* @throws IOException * */@Testpublic void testAdd() throws IOException{//索引在硬盤上面存放的位置..Directory directory=FSDirectory.open(new File("D:/INDEX"));//lucene 當前使用的版本...Version matchVersion=Version.LUCENE_44;//分詞器...(把一段文本分詞)(黑馬程序員是高端的培訓機構)//analzyer 是一個抽象類,具體的切分詞規則由子類實現...Analyzer analyzer=new StandardAnalyzer(matchVersion);IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer);//構造索引寫入的對象..IndexWriter indexWriter=new IndexWriter(directory, config);//往索引庫里面寫數據..//索引庫里面的數據都是document 一個document相當于是一條記錄//這個document里面的數據相當于索引結構..Document document=new Document();IndexableField indexableField=new IntField("id",1, Store.YES);IndexableField stringfield=new StringField("title","對王召廷的個人評價",Store.YES);IndexableField teIndexableField=new TextField("content","風流倜儻有點黃",Store.YES);document.add(indexableField);document.add(stringfield);document.add(teIndexableField);//索引庫里面接收的數據都是document對象 
indexWriter.addDocument(document);indexWriter.close();}/*** 對建立的索引進行搜索...* 通過indexSearcher 去搜索...* @throws IOException */@Testpublic void testSearcher() throws IOException{//索引在硬盤上面存放的位置..Directory directory=FSDirectory.open(new File("D:/INDEX"));//把索引目錄里面的索引讀取到IndexReader 當中...IndexReader indexReader=DirectoryReader.open(directory); // /構造搜索索引的對象..IndexSearcher indexSearcher=new IndexSearcher(indexReader);//Query 它是一個查詢條件對象,它是一個抽象類,不同的查詢規則就構造不同的子類...Query query=new TermQuery(new Term("title", "對王召廷的個人評價"));//檢索符合query 條件的前面N 條記錄..// TopDocs topDocs=indexSearcher.search(query, 10);//返回總記錄數... System.out.println(topDocs.totalHits);//存放的都是document 的idScoreDoc scoreDocs []=topDocs.scoreDocs;for(ScoreDoc scoreDoc:scoreDocs){//返回的就是document idint docID=scoreDoc.doc;//我還需要根據id 檢索到對應的documentDocument document=indexSearcher.doc(docID);System.out.println("id=="+document.get("id"));System.out.println("title=="+document.get("title"));System.out.println("content=="+document.get("content"));}}}原理分析圖:
demo演示:
根據入門代碼流程提煉工具類代碼:
import java.io.File; import java.io.IOException;import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version;/*** lucene 工具類...* @author Administrator**/ /*** 提煉規則,假設這段代碼可以完成一個功能,把這個代碼提煉到一個方法里面去,假設這個方法在某個業務羅繼承可以共用,那么往上抽取,* 假設在其它邏輯層也可以用,提煉到工具類里面去。* */ public class LuceneUtils {private static IndexWriter indexWriter=null;private static IndexSearcher indexSearcher=null;//索引存放目錄..private static Directory directory=null;private static IndexWriterConfig indexWriterConfig=null;private static Version version=null;private static Analyzer analyzer=null;static {try {directory=FSDirectory.open(new File(Constants.URL));version=Version.LUCENE_44;analyzer=new StandardAnalyzer(version);indexWriterConfig=new IndexWriterConfig(version, analyzer);} catch (IOException e) {e.printStackTrace();}}/*** * @return 返回用于操作索引的對象...* @throws IOException*/public static IndexWriter getIndexWriter() throws IOException{indexWriter=new IndexWriter(directory, indexWriterConfig);return indexWriter;}/*** 返回用于搜索索引的對象...* @return* @throws IOException */public static IndexSearcher getIndexSearcher() throws IOException{IndexReader indexReader=DirectoryReader.open(directory);indexSearcher=new IndexSearcher(indexReader);return indexSearcher;}/*** * 返回lucene 當前的版本...* @return*/public static Version getVersion() {return version;}/*** * 返回lucene 當前使用的分詞器..* @return*/public static Analyzer getAnalyzer() {return analyzer;}} public class Constants {/*** 索引存放的目錄*/public static final String URL="d:/indexdir/news"; }bean:
package cn.itcast.bean;

/**
 * Plain bean describing one article: numeric id plus title, content, author
 * and url, each exposed through a getter/setter pair.
 */
public class Article {

    private int id;
    private String title;
    private String content;
    private String author;
    private String url;

    /** @return the article id */
    public int getId() {
        return id;
    }

    /** @param id the article id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the title, or {@code null} if never set */
    public String getTitle() {
        return title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the content, or {@code null} if never set */
    public String getContent() {
        return content;
    }

    /** @param content the content */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the author, or {@code null} if never set */
    public String getAuthor() {
        return author;
    }

    /** @param author the author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the url, or {@code null} if never set */
    public String getUrl() {
        return url;
    }

    /** @param url the url */
    public void setUrl(String url) {
        this.url = url;
    }
}
轉換工具類:
package cn.itcast.lucene;import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexableField;import cn.itcast.bean.Article;/*8* 對象與索引庫document 之間的轉化* */ public class ArticleToDocument {public static Document articleToDocument(Article article){Document document=new Document();IntField idfield=new IntField("id", article.getId(), Store.YES);//StringField 對應的值不分詞,textField 分詞..TextField titleField=new TextField("title", article.getTitle(),Store.YES);TextField contentField=new TextField("content", article.getContent(),Store.YES);//修改這個字段對應的權重值,默認這個值為1f // contentField.setBoost(3f);StringField authorField=new StringField("author", article.getAuthor(), Store.YES);StringField urlField=new StringField("url", article.getUrl(), Store.YES);document.add(idfield);document.add(titleField);document.add(contentField);document.add(authorField);document.add(urlField);return document;}}Dao層:
package cn.itcast.dao;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import cn.itcast.bean.Article;
import cn.itcast.lucene.ArticleToDocument;
import cn.itcast.uitls.LuceneUtils;

/**
 * Operates on the index through the Lucene API: add, delete, update and
 * paged search of {@link Article} documents.
 *
 * @author Administrator
 */
public class LuceneDao {

    /**
     * Adds one article to the index.
     *
     * @param article the article to index
     * @throws IOException if the index cannot be written
     */
    public void addIndex(Article article) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            Document doc = ArticleToDocument.articleToDocument(article);
            indexWriter.addDocument(doc);
        } finally {
            // Always close the writer so a failure does not leave it open.
            indexWriter.close();
        }
    }

    /**
     * Deletes every document that contains the given term.
     *
     * @param fieldName  field the term belongs to
     * @param fieldValue term text to match
     * @throws IOException if the index cannot be written
     */
    public void delIndex(String fieldName, String fieldValue) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            Term term = new Term(fieldName, fieldValue);
            indexWriter.deleteDocuments(term);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Updates the index, comparable to {@code update table set ? where condition}.
     *
     * <p>Lucene first deletes the documents matching the term and then adds the
     * new document.
     *
     * @param fieldName  field of the term that selects the documents to replace
     * @param fieldValue text of that term
     * @param article    article whose document replaces the matches
     * @throws IOException if the index cannot be written
     */
    public void updateIndex(String fieldName, String fieldValue, Article article) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            // 1: the term is the update condition; 2: the document is the new content.
            Term term = new Term(fieldName, fieldValue);
            Document doc = ArticleToDocument.articleToDocument(article);
            indexWriter.updateDocument(term, doc);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Searches the title and content fields and prints one page of hits.
     * Paging works like (0,10), (10,10), (20,10), ...
     *
     * @param keywords    query text, parsed by the query parser
     * @param firstResult zero-based index of the first hit to print
     * @param maxResult   maximum number of hits to print
     * @throws IllegalArgumentException if {@code firstResult} is negative
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public void findIndex(String keywords, int firstResult, int maxResult) throws Exception {
        if (firstResult < 0) {
            throw new IllegalArgumentException("firstResult must not be negative: " + firstResult);
        }

        IndexSearcher indexSearcher = LuceneUtils.getIndexSearcher();
        try {
            // A single-field alternative would be:
            //   new TermQuery(new Term("title", "夢想"))
            // Here several fields are searched, comparable to
            //   select * from table where title="" or content=""
            String[] fields = {"title", "content"};

            // MultiFieldQueryParser lives in the separate query parser jar
            // (queryparser folder of the unpacked Lucene distribution).
            QueryParser queryParser =
                    new MultiFieldQueryParser(LuceneUtils.getVersion(), fields, LuceneUtils.getAnalyzer());
            Query query = queryParser.parse(keywords);

            // Fetch the best firstResult + maxResult hits; the requested page is
            // the tail of that list.
            TopDocs topDocs = indexSearcher.search(query, firstResult + maxResult);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            // Fewer hits than requested may come back, so stop at whichever is smaller.
            int endResult = Math.min(scoreDocs.length, firstResult + maxResult);
            for (int i = firstResult; i < endResult; i++) {
                // The doc id is maintained by Lucene itself.
                int docID = scoreDocs[i].doc;
                Document document = indexSearcher.doc(docID);
                System.out.println("id===" + document.get("id"));
                System.out.println("title===" + document.get("title"));
                System.out.println("content===" + document.get("content"));
                System.out.println("url===" + document.get("url"));
                System.out.println("author===" + document.get("author"));
            }
        } finally {
            // The reader behind the searcher was opened for this call; release it.
            indexSearcher.getIndexReader().close();
        }
    }
}
測試類:
package cn.itcast.junit;

import java.io.IOException;

import org.junit.Test;

import cn.itcast.bean.Article;
import cn.itcast.dao.LuceneDao;

/**
 * Exercises {@link LuceneDao}.
 *
 * @author Administrator
 */
public class LuceneDaoTest {

    private LuceneDao luceneDao = new LuceneDao();

    /** Builds the sample article used by the create and update tests. */
    private static Article sampleArticle(int id, String content) {
        Article sample = new Article();
        sample.setId(id);
        sample.setTitle("一定要夢想,萬一實現了勒");
        sample.setContent(content);
        sample.setUrl("http://www.tianmao.com");
        sample.setAuthor("馬云");
        return sample;
    }

    /** Indexes the sample article for every id in the range 28..28. */
    @Test
    public void testCreate() throws IOException {
        int id = 28;
        while (id <= 28) {
            luceneDao.addIndex(sampleArticle(id, "矯情我覺得這句話太矯情了矯情矯情矯情矯情矯情矯情"));
            id++;
        }
    }

    /**
     * Searches for a keyword, printing the page that starts at hit 20 with at
     * most 10 hits. Title and content are TextFields, tokenized by the
     * standard analyzer.
     */
    @Test
    public void testsearcher() throws Exception {
        luceneDao.findIndex("夢想", 20, 10);
    }

    /** Deletes the documents whose title contains the given term. */
    @Test
    public void testdelete() throws IOException {
        luceneDao.delIndex("title", "定");
    }

    /** Replaces the documents whose title contains the given term with article 9527. */
    @Test
    public void testUpdate() throws IOException {
        Article replacement = sampleArticle(9527, "我覺得這句話太矯情了");
        luceneDao.updateIndex("title", "定", replacement);
    }
}
分詞器的流程圖:
關于分詞器:網上可以找到很多種類的分詞器配合Lucene使用,具體的分詞規則請查看各分詞器對應的說明。
舉例如下:
Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);//中文單字切分、英文按空格切分成單詞
Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);//二分法分詞,中文相連的兩個詞作為一個索引
Analyzer analyzer=new IKAnalyzer();//第三方的分詞器,對中文支持較好,可以自定義分詞單詞與停用詞
?
索引庫優化
package cn.itcast.lucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import cn.itcast.uitls.Constants;

/**
 * Demonstrates ways of optimising an index.
 */
public class TestOptimise {

    /**
     * First approach: tune the merge policy through {@link IndexWriterConfig}.
     *
     * <p>This method carries no {@code @Test} annotation, so JUnit does not run it.
     *
     * @throws IOException if the index directory cannot be opened or written
     */
    public void testoptimise() throws IOException {
        Directory directory = FSDirectory.open(new File(Constants.URL));
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);

            LogDocMergePolicy mergePolicy = new LogDocMergePolicy();
            // Merge factor trade-off:
            //   smaller values (2 <= n <= 10): less memory is used while indexing,
            //     searching is faster, indexing is slower;
            //   larger values (n > 10): more memory is used while indexing,
            //     searching is slower, indexing is faster.
            mergePolicy.setMergeFactor(10);
            // Install the merge policy on the writer configuration.
            config.setMergePolicy(mergePolicy);

            IndexWriter indexWriter = new IndexWriter(directory, config);
            // Nothing is written here; close the writer so it does not stay open.
            indexWriter.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Second approach: optimise through the directory by loading the on-disk
     * index into memory and searching the in-memory copy.
     *
     * @throws IOException if the index cannot be opened or read
     */
    @Test
    public void testoptimise2() throws IOException {
        // The index currently lives on disk.
        Directory diskDirectory = FSDirectory.open(new File(Constants.URL));
        try {
            // Copy the contents of the disk directory into a RAM directory.
            IOContext ioContext = new IOContext();
            Directory ramDirectory = new RAMDirectory(diskDirectory, ioContext);
            try {
                IndexReader indexReader = DirectoryReader.open(ramDirectory);
                try {
                    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                    Query query = new TermQuery(new Term("title", "想"));
                    TopDocs topDocs = indexSearcher.search(query, 100);
                    System.out.println(topDocs.totalHits);
                } finally {
                    indexReader.close();
                }
            } finally {
                ramDirectory.close();
            }
        } finally {
            diskDirectory.close();
        }
    }

    /**
     * Third approach (placeholder): larger index files slow searching down, so
     * reduce the index size, e.g. by excluding stop words.
     */
    public void testoptimise3() {
    }

    /**
     * Fourth approach (placeholder): categorise the data and store the index
     * in separate directories.
     */
    public void testoptimise4() {
    }
}
轉載于:https://www.cnblogs.com/lm970585581/p/9410322.html
總結
- 上一篇: return 的使用
- 下一篇: SpringBoot随笔