當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

Lucene笔记一

發布時間：2025/5/22 编程问答 16 豆豆

生活随笔收集整理的這篇文章主要介紹了 Lucene笔记一，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

Lucene就是一個全文檢索的工具，建立索引用的，類似于新華字典的目錄

這里使用的是lucene-4.4.0版本，入門代碼所需jar包如下圖所示（解壓lucene-4.4.0后的目錄）：

入門代碼：

import java.io.File; import java.io.IOException;import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test;/*8* luceneDemo* */ public class TestLucene {/*** 通過lucene 提供的api 對數據建立索引，indexWriter* @throws IOException * */@Testpublic void testAdd() throws IOException{//索引在硬盤上面存放的位置..Directory directory=FSDirectory.open(new File("D:/INDEX"));//lucene 當前使用的版本...Version matchVersion=Version.LUCENE_44;//分詞器...(把一段文本分詞)（黑馬程序員是高端的培訓機構）//analzyer 是一個抽象類，具體的切分詞規則由子類實現...Analyzer analyzer=new StandardAnalyzer(matchVersion);IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer);//構造索引寫入的對象..IndexWriter indexWriter=new IndexWriter(directory, config);//往索引庫里面寫數據..//索引庫里面的數據都是document 一個document相當于是一條記錄//這個document里面的數據相當于索引結構..Document document=new Document();IndexableField indexableField=new IntField("id",1, Store.YES);IndexableField stringfield=new StringField("title","對王召廷的個人評價",Store.YES);IndexableField teIndexableField=new TextField("content","風流倜儻有點黃",Store.YES);document.add(indexableField);document.add(stringfield);document.add(teIndexableField);//索引庫里面接收的數據都是document對象 
indexWriter.addDocument(document);indexWriter.close();}/*** 對建立的索引進行搜索...* 通過indexSearcher 去搜索...* @throws IOException */@Testpublic void testSearcher() throws IOException{//索引在硬盤上面存放的位置..Directory directory=FSDirectory.open(new File("D:/INDEX"));//把索引目錄里面的索引讀取到IndexReader 當中...IndexReader indexReader=DirectoryReader.open(directory); // /構造搜索索引的對象..IndexSearcher indexSearcher=new IndexSearcher(indexReader);//Query 它是一個查詢條件對象，它是一個抽象類，不同的查詢規則就構造不同的子類...Query query=new TermQuery(new Term("title", "對王召廷的個人評價"));//檢索符合query 條件的前面N 條記錄..// TopDocs topDocs=indexSearcher.search(query, 10);//返回總記錄數... System.out.println(topDocs.totalHits);//存放的都是document 的idScoreDoc scoreDocs []=topDocs.scoreDocs;for(ScoreDoc scoreDoc:scoreDocs){//返回的就是document idint docID=scoreDoc.doc;//我還需要根據id 檢索到對應的documentDocument document=indexSearcher.doc(docID);System.out.println("id=="+document.get("id"));System.out.println("title=="+document.get("title"));System.out.println("content=="+document.get("content"));}}}

原理分析圖：

demo演示：

根據入門代碼流程提煉工具類代碼：

bean：

package cn.itcast.bean;public class Article {private int id;public int getId() {return id;}public void setId(int id) {this.id = id;}public String getTitle() {return title;}public void setTitle(String title) {this.title = title;}public String getContent() {return content;}public void setContent(String content) {this.content = content;}public String getAuthor() {return author;}public void setAuthor(String author) {this.author = author;}public String getUrl() {return url;}public void setUrl(String url) {this.url = url;}private String title;private String content;private String author;private String url;}

轉換工具類：

package cn.itcast.lucene;import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexableField;import cn.itcast.bean.Article;/*8* 對象與索引庫document 之間的轉化* */ public class ArticleToDocument {public static Document articleToDocument(Article article){Document document=new Document();IntField idfield=new IntField("id", article.getId(), Store.YES);//StringField 對應的值不分詞，textField 分詞..TextField titleField=new TextField("title", article.getTitle(),Store.YES);TextField contentField=new TextField("content", article.getContent(),Store.YES);//修改這個字段對應的權重值，默認這個值為1f // contentField.setBoost(3f);StringField authorField=new StringField("author", article.getAuthor(), Store.YES);StringField urlField=new StringField("url", article.getUrl(), Store.YES);document.add(idfield);document.add(titleField);document.add(contentField);document.add(authorField);document.add(urlField);return document;}}

Dao層：

package cn.itcast.dao;import java.io.IOException;import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs;import cn.itcast.bean.Article; import cn.itcast.lucene.ArticleToDocument; import cn.itcast.uitls.LuceneUtils;/*** 使用lucene 的API 來操作索引庫..* @author Administrator**/ public class LuceneDao {public void addIndex(Article article) throws IOException{IndexWriter indexWriter=LuceneUtils.getIndexWriter();Document doc=ArticleToDocument.articleToDocument(article);indexWriter.addDocument(doc);indexWriter.close();}/*** 刪除符合條件的記錄...* @param fieldName* @param fieldValue* @throws IOException*/public void delIndex(String fieldName,String fieldValue) throws IOException{IndexWriter indexWriter=LuceneUtils.getIndexWriter();//一定要夢想，萬一實現了勒Term term=new Term(fieldName, fieldValue);indexWriter.deleteDocuments(term);indexWriter.close();}/*** * 更新* * update table set ? where condtion* @throws IOException * * */public void updateIndex(String fieldName,String fieldValue,Article article) throws IOException{IndexWriter indexWriter=LuceneUtils.getIndexWriter();/*** 1:term 設置更新的條件...* * 2:設置更新的內容的對象..* */Term term=new Term(fieldName,fieldValue);Document doc=ArticleToDocument.articleToDocument(article);/*** * 在lucene 里面是先刪除符合這個條件term 的記錄，在創建一個doc 記錄...* */indexWriter.updateDocument(term, doc);indexWriter.close();}/*** 0,10* 10,10* 20,10* @param keywords* @throws Exception*/public void findIndex(String keywords,int firstResult,int maxResult) throws Exception{IndexSearcher indexSearcher=LuceneUtils.getIndexSearcher();//第一個條件.. 單字段查詢... 
// Query query=new TermQuery(new Term("title","夢想"))//select * from table where fieldname="" or content="" String fields []={"title","content"};//第二種條件：使用查詢解析器，多字段。。。我們需要重新導入一個jar queryParser 的jar... 位置在lucene解壓后的queryparser文件夾下QueryParser queryParser=new MultiFieldQueryParser(LuceneUtils.getVersion(),fields,LuceneUtils.getAnalyzer());// /這個事一個條件..Query query=queryParser.parse(keywords);//query 它是一個查詢條件，query 是一個抽象類，不同的查詢規則構造部同的子類即可//檢索符合query 條件的前面N 條記錄...//檢索的是索引目錄... (總記錄數，socreDOC (docID))//使用lucene 提供的api 進行操作...TopDocs topDocs=indexSearcher.search(query,firstResult+maxResult); // /存放的是docIDScoreDoc scoreDocs []=topDocs.scoreDocs;//判斷:scoreDocs 的length (實際取出來的數量..) 與 firstResult+maxResult 的值取小值...//在java jdk 里面提供了一個apiint endResult=Math.min(scoreDocs.length, firstResult+maxResult);for(int i=firstResult;i<endResult;i++){ // /取出來的是docID,這個id 是lucene 自己來維護。int docID=scoreDocs[i].doc;Document document=indexSearcher.doc(docID);System.out.println("id==="+document.get("id"));System.out.println("title==="+document.get("title"));System.out.println("content==="+document.get("content"));System.out.println("url==="+document.get("url"));System.out.println("author==="+document.get("author"));}} }

測試類：

package cn.itcast.junit;import java.io.IOException;import org.junit.Test;import cn.itcast.bean.Article; import cn.itcast.dao.LuceneDao;/*** 測試luceneDao* @author Administrator**/ public class LuceneDaoTest {private LuceneDao luceneDao=new LuceneDao();@Testpublic void testCreate() throws IOException{for(int i=28;i<=28;i++){Article article=new Article();article.setId(i);article.setTitle("一定要夢想，萬一實現了勒");article.setContent("矯情我覺得這句話太矯情了矯情矯情矯情矯情矯情矯情");article.setUrl("http://www.tianmao.com");article.setAuthor("馬云");luceneDao.addIndex(article);}}@Testpublic void testsearcher() throws Exception{ // article.setTitle("一定要夢想，萬一實現了勒"); textfield 分詞標準分詞器 // article.setContent("我覺得這句話太矯情了"); textfield 分詞標準分詞器luceneDao.findIndex("夢想",20,10);}@Testpublic void testdelete() throws IOException{String fieldName="title";String fieldValue="定";luceneDao.delIndex(fieldName, fieldValue);}@Testpublic void testUpdate() throws IOException{String fieldName="title";String fieldValue="定";Article article=new Article();article.setId(9527);article.setTitle("一定要夢想，萬一實現了勒");article.setContent("我覺得這句話太矯情了");article.setUrl("http://www.tianmao.com");article.setAuthor("馬云");luceneDao.updateIndex(fieldName, fieldValue, article);}}

分詞器的流程圖：

關于分詞器，網上可以找到很多種類的分詞器配合Lucene使用，相關分詞規則查看對應說明。

舉例如下：

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);//中文單字切分、英文按空格切分成單詞

Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);//二分法分詞，中文相連的兩個字作為一個索引

Analyzer analyzer=new IKAnalyzer();//第三方的分詞器，對中文支持較好，可以自定義分詞單詞與停用詞

索引庫優化

package cn.itcast.lucene;import java.io.File; import java.io.IOException;import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.junit.Test;import cn.itcast.uitls.Constants;public class TestOptimise {/*8* 優化的第一種方式:通過 IndexWriterConfig 優化設置mergePolicy（合并策略）* * */public void testoptimise() throws IOException{Directory directory=FSDirectory.open(new File(Constants.URL));Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_44, analyzer);LogDocMergePolicy mergePolicy=new LogDocMergePolicy();/*** 當這個值越小，更少的內存會被運用當創建索引的時候，搜索的時候越快，創建的時候越慢。* 當這個值越大，更多的內存會被運用當創建索引的時候，搜索的時候越慢，創建的時候越快..* larger values >10* * 2<=smaller<=10* *///設置合并因子..mergePolicy.setMergeFactor(10); // /設置索引的合并策略.. 
config.setMergePolicy(mergePolicy);IndexWriter indexWriter=new IndexWriter(directory, config);}/*** 通過directory 去優化....* @throws IOException * */@Testpublic void testoptimise2() throws IOException{//現在的索引放在硬盤上面...Directory directory=FSDirectory.open(new File(Constants.URL)); // /通過這個對象吧directory 里面的數據讀取到directory1 里面來..IOContext ioContext=new IOContext();//相辦法吧directory 的索引讀取到內存當中來...Directory directory1=new RAMDirectory(directory,ioContext);IndexReader indexReader=DirectoryReader.open(directory1);IndexSearcher indexSearcher=new IndexSearcher(indexReader);Query query=new TermQuery(new Term("title", "想"));TopDocs topDocs=indexSearcher.search(query, 100);System.out.println(topDocs.totalHits);}/*** 索引文件越大，會影響檢索的速度.. (減少索引文件的大小)* * 1:排除停用詞..* */public void testoptimise3(){}/*** 將索引分目盤存放將數據歸類...* */public void testoptimise4(){} }

轉載于:https://www.cnblogs.com/lm970585581/p/9410322.html

總結

以上是生活随笔為你收集整理的Lucene笔记一的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。