Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new analyzer and tokenizer ik_max_word_char #854

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,25 @@ IK Analysis for Elasticsearch

The IK Analysis plugin integrates the Lucene IK analyzer (http://code.google.com/p/ik-analyzer/) into Elasticsearch and supports customized dictionaries.

Analyzer: `ik_smart` , `ik_max_word` , Tokenizer: `ik_smart` , `ik_max_word`
Analyzer: `ik_smart` , `ik_max_word` , `ik_max_word_char` ; Tokenizer: `ik_smart` , `ik_max_word` , `ik_max_word_char`

Versions
--------

IK version | ES version
-----------|-----------
master | 7.x -> master
6.x| 6.x
5.x| 5.x
1.10.6 | 2.4.6
1.9.5 | 2.3.5
1.8.1 | 2.2.1
1.7.0 | 2.1.1
1.5.0 | 2.0.0
1.2.6 | 1.0.0
1.2.5 | 0.90.x
1.1.3 | 0.20.x
1.0.0 | 0.16.2 -> 0.19.0
| IK version | ES version |
| ---------- | ---------------- |
| master | 7.x -> master |
| 6.x | 6.x |
| 5.x | 5.x |
| 1.10.6 | 2.4.6 |
| 1.9.5 | 2.3.5 |
| 1.8.1 | 2.2.1 |
| 1.7.0 | 2.1.1 |
| 1.5.0 | 2.0.0 |
| 1.2.6 | 1.0.0 |
| 1.2.5 | 0.90.x |
| 1.1.3 | 0.20.x |
| 1.0.0 | 0.16.2 -> 0.19.0 |

Install
-------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,14 @@
public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer> {
private final IKAnalyzer analyzer;

public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, boolean useSmart) {
this(indexSettings, env, name, settings, useSmart, false);
}

public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart, boolean includeSingleChar) {
super(indexSettings, name, settings);

Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);
Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart).setIncludeSingleChar(includeSingleChar);

analyzer=new IKAnalyzer(configuration);
}
Expand All @@ -25,6 +29,10 @@ public static IkAnalyzerProvider getIkAnalyzerProvider(IndexSettings indexSettin
return new IkAnalyzerProvider(indexSettings,env,name,settings,false);
}

/**
 * Factory for the {@code ik_max_word_char} analyzer: non-smart (fine-grained)
 * segmentation with single-character tokens additionally emitted.
 */
public static IkAnalyzerProvider getIkIncludeCharAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    return new IkAnalyzerProvider(indexSettings, env, name, settings, false, true);
}

/** Returns the single {@link IKAnalyzer} instance built when this provider was constructed. */
@Override
public IKAnalyzer get() {
    return analyzer;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ public static IkTokenizerFactory getIkTokenizerFactory(IndexSettings indexSettin
return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(false);
}

/**
 * Factory for the {@code ik_max_word_char} tokenizer: non-smart segmentation
 * that also emits single-character tokens.
 */
public static IkTokenizerFactory getIkIncludeCharTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    IkTokenizerFactory factory = new IkTokenizerFactory(indexSettings, env, name, settings);
    return factory.setSmart(false).setIncludeSingleChar(true);
}

/** Factory for the {@code ik_smart} tokenizer ({@code useSmart} segmentation mode). */
public static IkTokenizerFactory getIkSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    IkTokenizerFactory factory = new IkTokenizerFactory(indexSettings, env, name, settings);
    return factory.setSmart(true);
}
Expand All @@ -28,6 +32,11 @@ public IkTokenizerFactory setSmart(boolean smart){
return this;
}

/**
 * Propagates the single-character flag into the underlying {@code Configuration}.
 *
 * @param includeSingleChar true to also emit single-character tokens
 * @return this factory, for chaining
 */
public IkTokenizerFactory setIncludeSingleChar(boolean includeSingleChar) {
    configuration.setIncludeSingleChar(includeSingleChar);
    return this;
}

/** Builds a new {@link IKTokenizer} backed by this factory's configuration. */
@Override
public Tokenizer create() {
    return new IKTokenizer(configuration);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokeniz

extra.put("ik_smart", IkTokenizerFactory::getIkSmartTokenizerFactory);
extra.put("ik_max_word", IkTokenizerFactory::getIkTokenizerFactory);
extra.put("ik_max_word_char", IkTokenizerFactory::getIkIncludeCharTokenizerFactory);

return extra;
}
Expand All @@ -34,6 +35,7 @@ public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends An

extra.put("ik_smart", IkAnalyzerProvider::getIkSmartAnalyzerProvider);
extra.put("ik_max_word", IkAnalyzerProvider::getIkAnalyzerProvider);
extra.put("ik_max_word_char", IkAnalyzerProvider::getIkIncludeCharAnalyzerProvider);

return extra;
}
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/org/wltea/analyzer/cfg/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ public class Configuration {
//是否启用小写处理
private boolean enableLowercase=true;

//是否包含单个字符
private boolean includeSingleChar=false;

@Inject
public Configuration(Environment env,Settings settings) {
Expand All @@ -36,6 +38,7 @@ public Configuration(Environment env,Settings settings) {
this.useSmart = settings.get("use_smart", "false").equals("true");
this.enableLowercase = settings.get("enable_lowercase", "true").equals("true");
this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
this.includeSingleChar = settings.get("include_single_char", "false").equals("true");

Dictionary.initial(this);

Expand Down Expand Up @@ -72,4 +75,13 @@ public boolean isEnableRemoteDict() {
public boolean isEnableLowercase() {
return enableLowercase;
}

/** @return whether single characters are emitted as standalone tokens */
public boolean isIncludeSingleChar() {
    return this.includeSingleChar;
}

/**
 * Sets whether single characters should be emitted as standalone tokens.
 *
 * @param singleChar true to emit single-character tokens
 * @return this configuration, for chaining
 */
public Configuration setIncludeSingleChar(boolean singleChar) {
    this.includeSingleChar = singleChar;
    return this;
}
}
24 changes: 14 additions & 10 deletions src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,14 @@ class CJKSegmenter implements ISegmenter {
static final String SEGMENTER_NAME = "CJK_SEGMENTER";
//待处理的分词hit队列
private List<Hit> tmpHits;

//是否包含单字分词
private boolean includeSingleChar=false;


CJKSegmenter(){
/**
 * @param singleChar when true, single characters are also emitted as lexemes
 *                   even if they do not match a dictionary word
 */
CJKSegmenter(boolean singleChar) {
    this.includeSingleChar = singleChar;
    this.tmpHits = new LinkedList<>();
}

Expand Down Expand Up @@ -78,21 +83,20 @@ public void analyze(AnalyzeContext context) {
//*********************************
//再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成词
if(singleCharHit.isMatch()) {//首字成词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
} else if (includeSingleChar) {//单字拆词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNCHAR);
context.addLexeme(newLexeme);
}

//同时也是词前缀
if(singleCharHit.isPrefix()){
//前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}
}else if(singleCharHit.isPrefix()){//首字为词前缀
// 判断词前缀
if(singleCharHit.isPrefix()){
//前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}


}else{
//遇到CHAR_USELESS字符
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/wltea/analyzer/core/IKSegmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ private List<ISegmenter> loadSegmenters(){
//处理中文数量词的子分词器
segmenters.add(new CN_QuantifierSegmenter());
//处理中文词的子分词器
segmenters.add(new CJKSegmenter());
segmenters.add(new CJKSegmenter(configuration.isIncludeSingleChar()));
return segmenters;
}

Expand Down