java 中文分词算法,基于: lucene+IKAnalyzer - V2EX
V2EX = way to explore
V2EX 是一个关于分享和探索的地方
Sign Up Now
For Existing Member  Sign In
3023369823
V2EX    Java

java 中文分词算法,基于: lucene+IKAnalyzer

  •  1
     
  •   3023369823 Aug 22, 2016 5272 views
    This topic created in 3538 days ago, the information mentioned may be changed or developed.

    我想只要是学过数据库的孩纸,不管是 mysql ,还是 sqlserver ,一提到查找,本能的想到的便是 like 关键字,其实去转盘网(分类模式)之前也是采用这种算法,但我可以告诉大家一个很不幸的事情, like 匹配其实会浪费大量的有用资源,原因这里不说了请自己想一想,我们还是直接摆事实验证。

    现在用去转盘网搜: hello 这个单词,如下:

    点击实验

    翻页你会发现只要是包含 hello 的单词都找到了,但是如果你用 like 的话是不会有这个效果的,不信让我们再看一下,还好他来说电影网的分词算法我还没来得及修改,还可以看到现象:

    点击实验

    你会发现只有开始包含 hello 这个字段的搜索串才能得到匹配,这就问题来了,数据库中大量的资源岂不是白白浪费了,不过没事,伟大的人类还是很聪明的,发明了分词,分词的原理我就不讲了,请自己百度吧,还是直接上代码,提示,这里需要四个 jar 包作为工具,我先上传的去转盘,想要做分词的请先下载:

    分词包下载地址 1

    <a href="http://www.quzhuanpan.com/download/checkResult.action?id=34&type=6" rel="nofollow">分词包下载地址 2</a>

    package com.tray.indexData; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.math.BigInteger; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; import com.tray.bean.SerachResult; import com.tray.common.tools.DateFormater; public class LuceneSearch { private static String DISC_URL = "/home/indexData/data"; static { String os = System.getProperty("os.name"); if(os.toLowerCase().startsWith("win")){ DISC_URL = "E:\\indexData\\data"; } else{ DISC_URL ="/home/indexData/data"; } } //指定分词器 private Analyzer analyzer=new IKAnalyzer(); private static Directory directory; //配置 private static IndexWriterConfig iwConfig; //配置 
IndexWriter private static IndexWriter writer; private static File indexFile = null; private static Version version = Version.LUCENE_36; private final int PAPGESIZE=10; /** * 全量索引 * @Author haoning */ public void init() throws Exception { try { indexFile = new File(DISC_URL); if (!indexFile.exists()) { indexFile.mkdir(); } directory=FSDirectory.open(indexFile); //配置 IndexWriterConfig iwCOnfig= new IndexWriterConfig(version,analyzer); iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); //创建写索引对象 writer = new IndexWriter(directory,iwConfig); } catch (Exception e) { } } public void closeWriter(){ try { writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void commit(){ try { writer.commit(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } /** * 一个一个索引 * @Author haoning */ public void singleIndex(Document doc) throws Exception { writer.addDocument(doc); } /** * 一个跟新 * @Author haoning */ public void singleUpdate(Document doc) throws Exception { Term term = new Term("url", doc.get("url")); writer.updateDocument(term,doc); } /** * 全量索引 * @Author haoning */ public void fullIndex(Document[] documentes) throws Exception { writer.deleteAll(); for (Document document : documentes) { writer.addDocument(document); } writer.commit(); } /** * 根据 id 删除索引 * @Author haoning */ public void deleteIndex(Document document)throws Exception{ Term term = new Term("url", document.get("url"));//url 才是唯一标志 writer.deleteDocuments(term); writer.commit(); } /** * 根据 id 增量索引 * @Author haoning */ public void updateIndex(Document[] documentes) throws Exception{ for (Document document : documentes) { Term term = new Term("url", document.get("url")); writer.updateDocument(term, document); } writer.commit(); } /** * 直接查询 * @Author haoning */ public void simpleSearch(String filedStr,String queryStr,int page, int pageSize) throws Exception{ File indexDir = new 
File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector topCollector = TopScoreDocCollector.create(searcher.maxDoc(), false); Term term = new Term(filedStr, queryStr); Query query = new TermQuery(term); searcher.search(query, topCollector); ScoreDoc[] docs = topCollector.topDocs((page-1)*pageSize, pageSize).scoreDocs; printScoreDoc(docs, searcher); } /** * 高亮查询 * @Author haoning */ public Map<String, Object> highLightSearch(String filed,String keyWord,int curpage, int pageSize) throws Exception{ List<SerachResult> list=new ArrayList<SerachResult>(); Map<String,Object> map = new HashMap<String,Object>(); if (curpage <= 0) { curpage = 1; } if (pageSize <= 0 || pageSize>20) { pageSize = PAPGESIZE; } File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir);//根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir);//搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); int start = (curpage - 1) * pageSize; Analyzer analyzer = new IKAnalyzer(true); QueryParser queryParser = new QueryParser(Version.LUCENE_36, filed, analyzer); queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = queryParser.parse(keyWord); int hm = start + pageSize; TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); searcher.search(query, res); SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); long amount = res.getTotalHits(); //long pages = (rowCount - 1) / pageSize + 1; //计算总页数 map.put("amount",amount);//总共多少条记录 TopDocs tds = res.topDocs(start, pageSize); ScoreDoc[] sd = tds.scoreDocs; for (int i = 0; i < sd.length; i++) { Document doc = searcher.doc(sd[i].doc); String temp=doc.get("name"); //做高亮处理 TokenStream ts = analyzer.tokenStream("name", new 
StringReader(temp)); SerachResult record=new SerachResult(); String name = highlighter.getBestFragment(ts,temp); String skydirverName=doc.get("skydirverName"); String username=doc.get("username"); String shareTime=doc.get("shareTime"); String describ=doc.get("describ"); String typeId=doc.get("typeId"); String id=doc.get("id"); String url=doc.get("url"); record.setName(name); record.setSkydriverName(skydirverName); record.setUsername(username); record.setShareTime(DateFormater.getFormatDate(shareTime,"yyyy-MM-dd HH:mm:ss")); record.setDescrib(describ); record.setTypeId(Integer.parseInt(typeId)); record.setId(new BigInteger(id)); record.setUrl(url); list.add(record); /*System.out.println("name:"+name); System.out.println("skydirverName:"+skydirverName); System.out.println("username:"+username); System.out.println("shareTime:"+shareTime); System.out.println("describ:"+describ); System.out.println("typeId:"+typeId); System.out.println("id:"+id); System.out.println("url:"+url);*/ } map.put("source",list); return map; } /** * 根据前缀查询 * @Author haoning */ public void prefixSearch(String filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); Term term = new Term(filedStr, queryStr); Query query = new PrefixQuery(term); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } /** * 通配符查询 * @Author haoning */ public void wildcardSearch(String filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); Term term = new Term(filedStr, queryStr); Query query = new WildcardQuery(term); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } 
/** * 分词查询 * @Author haoning */ public void analyzerSearch(String filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); QueryParser queryParser = new QueryParser(version, filedStr, analyzer); Query query = queryParser.parse(queryStr); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } /** * 多属性分词查询 * @Author haoning */ public void multiAnalyzerSearch(String[] filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); QueryParser queryParser = new MultiFieldQueryParser(version, filedStr, analyzer); Query query = queryParser.parse(queryStr); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } public void printScoreDoc(ScoreDoc[] docs,IndexSearcher searcher)throws Exception{ for (int i = 0; i < docs.length; i++) { List<Fieldable> list = searcher.doc(docs[i].doc).getFields(); for (Fieldable fieldable : list) { String fieldName = fieldable.name(); String fieldValue = fieldable.stringValue(); System.out.println(fieldName+" : "+fieldValue); } } } } 

    注意由于去转盘网是部署到 linux 上的,所以 DISC_URL 可以根据系统变换,我是通过 url 来判定索引文件是否唯一的,你可以根据 id 来判断,具体情况具体对待吧。 注:这是楼主在 V2EX 上发的第二篇技术贴,可惜还是受字数限制只能写一半,老规矩,要看去博客园看吧,链接如下:http://www.cnblogs.com/huangxie/p/5473273.html 你可以在一个 application 程序中开始索引,也可以写个定时器来定时索引,看需求。以上代码是楼主辛苦的作品,转载请不要改动,本人确保代码完全可用。本人建个 qq 群,欢迎大家一起交流技术, 群号: 512245829 喜欢微博的朋友关注:转盘娱乐即可

    16 replies    2016-08-23 20:34:58 +08:00
    shoumu
        1
    shoumu  
       Aug 22, 2016
    你在做分词呢,还是在做检索。。。
    3023369823
        2
    3023369823  
    OP
       Aug 22, 2016
    @shoumu 检索的前提是分词,哈哈,都做了,看来是个行家,顶你
    EPr2hh6LADQWqRVH
        3
    EPr2hh6LADQWqRVH  
       Aug 22, 2016
    IK 我记得基本就没有算法可言。。就是查词典,仅此而已。。。
    基于 CRF 的 ansj 是相当有水平的,
    jieba 里面有不知道从哪搞到的 IDF 信息,
    lz 可以考虑替换一下。
    elepant
        4
    elepant  
       Aug 22, 2016
    代码放 GitHub 上吧
    3023369823
        5
    3023369823  
    OP
       Aug 22, 2016
    @avastms 你说的对,说算法是有点牵强了,但是名字是改不了了,就先这样吧,谢谢你的指正
    3023369823
        6
    3023369823  
    OP
       Aug 22, 2016
    @Asan 好的,有时间弄下
    Lucups
        7
    Lucups  
       Aug 22, 2016
    我觉得基于统计的中文分词云服务才是未来啊
    3023369823
        8
    3023369823  
    OP
       Aug 22, 2016
    @Lucups 不是一般人能搞的,哈哈,看看人家百度, google 是否放接口
    jason19659
        9
    jason19659  
       Aug 22, 2016
    .....
    hinkal
        10
    hinkal  
       Aug 22, 2016 via Android
    斯坦福中文分词不是开源吗
    3023369823
        11
    3023369823  
    OP
       Aug 22, 2016
    @hinkal lucene 就是开源的
    tausi0661
        12
    tausi0661  
       Aug 23, 2016
    @3023369823 lucene 是索引框架不是中文切词的. 你用的 IK 才是切词. 很多年前还用过一个叫庖丁解牛的:D 可惜和 IK 一样貌似已经不维护了.
    http://git.oschina.net/zhzhenqin/paoding-analysis
    3023369823
        13
    3023369823  
    OP
       Aug 23, 2016
    你说的没错,是这样的
    3023369823
        14
    3023369823  
    OP
       Aug 23, 2016
    @tausi0661 你说的没错,是这样的
    hantsy
        15
    hantsy  
       Aug 23, 2016
    IKAnalyzer 以前用过一次,一样的中文可以分出不同结果。结果不用了,还是直接中文关键字匹配就行了。
    3023369823
        16
    3023369823  
    OP
       Aug 23, 2016
    @hantsy 中文关键字匹配很多匹配不到,你用的是不旧版本,[去转盘]( http://www.quzhuanpan.com),效果很不错嘛
    About     Help     Advertise     Blog     API     FAQ     Solana     868 Online   Highest 6679       Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 68ms UTC 22:24 PVG 06:24 LAX 15:24 JFK 18:24
    Do have faith in what you're doing.
    ubao msn snddm index pchome yahoo rakuten mypaper meadowduck bidyahoo youbao zxmzxm asda bnvcg cvbfg dfscv mmhjk xxddc yybgb zznbn ccubao uaitu acv GXCV ET GDG YH FG BCVB FJFH CBRE CBC GDG ET54 WRWR RWER WREW WRWER RWER SDG EW SF DSFSF fbbs ubao fhd dfg ewr dg df ewwr ewwr et ruyut utut dfg fgd gdfgt etg dfgt dfgd ert4 gd fgg wr 235 wer3 we vsdf sdf gdf ert xcv sdf rwer hfd dfg cvb rwf afb dfh jgh bmn lgh rty gfds cxv xcv xcs vdas fdf fgd cv sdf tert sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf shasha9178 shasha9178 shasha9178 shasha9178 shasha9178 liflif2 liflif2 liflif2 liflif2 liflif2 liblib3 liblib3 liblib3 liblib3 liblib3 zhazha444 zhazha444 zhazha444 zhazha444 zhazha444 dende5 dende denden denden2 denden21 fenfen9 fenf619 fen619 fenfe9 fe619 sdf sdf sdf sdf sdf zhazh90 zhazh0 zhaa50 zha90 zh590 zho zhoz zhozh zhozho zhozho2 lislis lls95 lili95 lils5 liss9 sdf0ty987 sdft876 sdft9876 sdf09876 sd0t9876 sdf0ty98 sdf0976 sdf0ty986 sdf0ty96 sdf0t76 sdf0876 df0ty98 sf0t876 sd0ty76 sdy76 sdf76 sdf0t76 sdf0ty9 sdf0ty98 sdf0ty987 sdf0ty98 sdf6676 sdf876 sd876 sd876 sdf6 sdf6 sdf9876 sdf0t sdf06 sdf0ty9776 sdf0ty9776 sdf0ty76 sdf8876 sdf0t sd6 sdf06 s688876 sd688 sdf86