Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support multiple etymologies #893

Open
wants to merge 55 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
ec62312
reformat code
qicz Jul 7, 2021
cf705e2
optimize code
qicz Jul 8, 2021
64c43a7
crlf to lf
qicz Jul 8, 2021
fd920fc
update config to yml
qicz Jul 9, 2021
a843449
update grant
qicz Jul 9, 2021
8d39443
update yml read logic
qicz Jul 9, 2021
f730a79
optimize code
qicz Jul 10, 2021
0d42c2d
optimize configuration
qicz Jul 10, 2021
df31678
optimize configuration properties
qicz Jul 10, 2021
ed59f32
refactor configuration properties
qicz Jul 11, 2021
3fc5545
refactor configuration properties
qicz Jul 11, 2021
497adf5
rename reload dict
qicz Jul 11, 2021
e61efef
add mysql and redis config
qicz Jul 11, 2021
5a62761
add remote dictionary logic
qicz Jul 11, 2021
222c606
export properties
qicz Jul 11, 2021
a2f698e
optimize dictionary & helper
qicz Jul 11, 2021
bc019df
refactor remote dictionary
qicz Jul 11, 2021
cffb603
fix remote dictionary inital logic
qicz Jul 11, 2021
8a9569a
typo
qicz Jul 11, 2021
2f42e53
optimize dictionary
qicz Jul 11, 2021
9ac1906
mysql remote dictionary implementation
qicz Jul 12, 2021
cca6af1
redis remote dictionary implementation
qicz Jul 12, 2021
c6f7843
fix http schema remote dictionary bug & optimize redis remote dictionary
qicz Jul 12, 2021
baee3e6
update readme
qicz Jul 12, 2021
a64a303
update readme
qicz Jul 12, 2021
b04baad
refactor dictionary logic
qicz Jul 12, 2021
4879120
typo
qicz Jul 12, 2021
b26cd88
optimize configuration
qicz Jul 12, 2021
922e77d
refactor remote dictionary schema logic
qicz Jul 12, 2021
f141420
support custom domain and etymology
qicz Jul 13, 2021
b3f364d
refactor dictionary logic
qicz Jul 13, 2021
4135faa
update readme
qicz Jul 13, 2021
3833fab
typo
qicz Jul 13, 2021
dc612cf
format readme
qicz Jul 13, 2021
6d7187c
fix default-domain logic
qicz Jul 13, 2021
2d6a813
check etymology
qicz Jul 13, 2021
7717197
format log
qicz Jul 13, 2021
8a5b208
refactor mysql remote dictionary logic
qicz Jul 14, 2021
334c670
link starter
qicz Jul 14, 2021
60d5812
import redip
qicz Jul 15, 2021
b4fc23d
update log
qicz Jul 15, 2021
0a080d1
using released redip
qicz Jul 15, 2021
78ac658
using AssertKit
qicz Jul 15, 2021
03e94df
update redip
qicz Jul 16, 2021
fd42a35
optimize dictionary reload logic
qicz Jul 16, 2021
42cd120
update testing
qicz Jul 16, 2021
de55714
testing & using redip 1.0.2, redis remote dictionary using zset store…
qicz Jul 16, 2021
e7961da
using enableMonitor function variable
qicz Jul 16, 2021
c00362f
update test logging
qicz Jul 16, 2021
a0b8d17
update redip1.0.3: add remote dictionary shutdown hook, close the res…
qicz Jul 19, 2021
bb4b9b2
adaptive ik xml configuration
qicz Jul 22, 2021
ad6a830
optimize import
qicz Jul 22, 2021
c87f744
update readme
qicz Jul 22, 2021
8c51480
fix readme bug
qicz Jul 22, 2021
bae22f7
support redis cluster
qicz Aug 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
support custom domain and etymology
  • Loading branch information
qicz committed Jul 13, 2021
commit f14142004944f6bbe03748d5bacece043ad04255
107 changes: 84 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,38 +20,99 @@ dict: # 扩展词库配置
stop: # 本地stop词典扩展词典文件
- extra_stopword.dic
remote: # 远程扩展词典配置
# schema: http or redis or mysql
# redis://words-key, eg: redis://ik-main-words or redis://ik-stop-words
# mysql://tableName, eg: mysql://ik_main_words or mysql://ik_stop_words
main: # 远程主词典扩展词典文件
- # http://....
- # redis://
- # mysql://
stop: # 远程stop词典扩展词典文件
- # http://....
- # redis://
- # mysql://
http:
# http 服务地址
# main-words path: ${base}/es-dict/main-words/{domain}
# stop-words path: ${base}/es-dict/stop-words/{domain}
base: http://localhost
redis:
# main-words key: es-ik-words:{domain}:main-words
# stop-words key: es-ik-words:{domain}:stop-words
host: localhost
port: 6379
database: 0
username:
password:
mysql:
url: jdbc:mysql://127.0.0.1/ik-db?useSSL=false&serverTimezone=GMT%2B8
username: root
password: dbadmin
refresh: # 刷新配置
delay: 10 # 延迟时间,单位s
period: 60 # 周期时间,单位s

mysql:
url: jdbc:mysql://127.0.0.1/ik-db?useSSL=false&serverTimezone=GMT%2B8
username: root
password: dbadmin

redis:
host: localhost
port: 6379
database: 0
username:
password:
```

- 调整优化重构Dictionary实现;
- 支持根据不同的业务指定远程动态词源

```bash
PUT es-ik-index
{
"settings": {
"analysis.analyzer": {
"ik_smart": {
"type":"ik_smart",
"enable_remote_dict": true,
"domain": "order", # 业务领域
"etymology": "redis" # 词源,可取值:redis,http,mysql,默认为redis
}
}
},
"mappings": {
"_doc": {
"properties": {
"field1": {
"type": "text",
"analyzer": "ik_smart"
}
}
}
}
}
```



- 修复和重构Http扩展词提供方式的bug;
- 扩展RemoteDictionary,提供可配置的基于MySQL、Redis的扩展词库更新方式;

```sql
/*
@author Qicz

Date: 13/07/2021 10:18:19
*/

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for ik_sequence
-- ----------------------------
DROP TABLE IF EXISTS `ik_sequence`;
CREATE TABLE `ik_sequence` (
`current_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`domain` varchar(100) NOT NULL COMMENT '所属领域',
PRIMARY KEY (`current_id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- ----------------------------
-- Table structure for ik_words
-- ----------------------------
DROP TABLE IF EXISTS `ik_words`;
CREATE TABLE `ik_words` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`word` varchar(200) NOT NULL,
`word_type` tinyint(4) unsigned NOT NULL COMMENT 'word类型,1主词库,2stop词库',
`domain` varchar(100) NOT NULL COMMENT '所属领域',
`create_time` datetime NOT NULL COMMENT '创建时间',
PRIMARY KEY (`id`),
UNIQUE KEY `domain_word` (`word`,`domain`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

SET FOREIGN_KEY_CHECKS = 1;
```

> `jre/lib/security/java.policy`的grant中加入 `permission java.security.AllPermission;`

#### TODO by Qicz
Expand Down
32 changes: 12 additions & 20 deletions config/ik-db.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
@author Qicz
Date: 12/07/2021 13:16:34

Date: 13/07/2021 10:18:19
*/

SET NAMES utf8mb4;
Expand All @@ -11,32 +12,23 @@ SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
DROP TABLE IF EXISTS `ik_sequence`;
CREATE TABLE `ik_sequence` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`dictionary` varchar(100) NOT NULL,
`current_id` int(10) unsigned NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- ----------------------------
-- Table structure for ik_stop_words
-- ----------------------------
DROP TABLE IF EXISTS `ik_stop_words`;
CREATE TABLE `ik_stop_words` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`word` varchar(200) NOT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `word` (`word`)
`current_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`domain` varchar(100) NOT NULL COMMENT '所属领域',
PRIMARY KEY (`current_id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- ----------------------------
-- Table structure for ik_main_words
-- Table structure for ik_words
-- ----------------------------
DROP TABLE IF EXISTS `ik_main_words`;
CREATE TABLE `ik_main_words` (
DROP TABLE IF EXISTS `ik_words`;
CREATE TABLE `ik_words` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`word` varchar(200) NOT NULL,
`word_type` tinyint(4) unsigned NOT NULL COMMENT 'word类型,1主词库,2stop词库',
`domain` varchar(100) NOT NULL COMMENT '所属领域',
`create_time` datetime NOT NULL COMMENT '创建时间',
PRIMARY KEY (`id`),
UNIQUE KEY `word` (`word`)
UNIQUE KEY `domain_word` (`word`,`domain`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

SET FOREIGN_KEY_CHECKS = 1;
39 changes: 17 additions & 22 deletions config/ikanalyzer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,24 @@ dict: # 扩展词库配置
stop: # 本地stop词典扩展词典文件
- extra_stopword.dic
remote: # 远程扩展词典配置
# schema: http or redis or mysql
# redis://words-key, eg: redis://ik-main-words or redis://ik-stop-words
# mysql://tableName, eg: mysql://ik_main_words or mysql://ik_stop_words
main: # 远程主词典扩展词典文件
- # http://....
- # redis://
- # mysql://
stop: # 远程stop词典扩展词典文件
- # http://....
- # redis://
- # mysql://
http:
# http 服务地址
# main-words path: ${base}/es-dict/main-words/{domain}
# stop-words path: ${base}/es-dict/stop-words/{domain}
base: http://localhost
redis:
# main-words key: es-ik-words:{domain}:main-words
# stop-words key: es-ik-words:{domain}:stop-words
host: localhost
port: 6379
database: 0
username:
password:
mysql:
url: jdbc:mysql://127.0.0.1/ik-db?useSSL=false&serverTimezone=GMT%2B8
username: root
password: dbadmin
refresh: # 刷新配置
delay: 10 # 延迟时间,单位s
period: 60 # 周期时间,单位s

mysql:
url: jdbc:mysql://127.0.0.1/ik-db?useSSL=false&serverTimezone=GMT%2B8
username: root
password: dbadmin

redis:
host: localhost
port: 6379
database: 0
username:
password:
22 changes: 19 additions & 3 deletions src/main/java/org/wltea/analyzer/configuration/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.wltea.analyzer.dictionary.DefaultDictionary;
import org.wltea.analyzer.dictionary.Dictionary;
import org.wltea.analyzer.dictionary.remote.RemoteDictionary;
import org.wltea.analyzer.dictionary.remote.RemoteDictionaryEtymology;
import org.wltea.analyzer.help.ESPluginLoggerFactory;
import org.yaml.snakeyaml.Yaml;
import org.yaml.snakeyaml.constructor.CustomClassLoaderConstructor;
Expand All @@ -21,9 +22,11 @@
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.net.URI;
import java.nio.file.Path;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Objects;

public class Configuration {

Expand All @@ -39,10 +42,12 @@ public class Configuration {
private boolean enableLowercase;

/** File name of the plugin's YAML configuration file. */
private static final String IKANALYZER_YML = "ikanalyzer.yml";
/**
 * Domain used when the analyzer settings carry no "domain" entry.
 * Declared static: it is a class-wide constant, not per-instance state.
 */
private static final String DEFAULT_DOMAIN = "default-domain";

/** Guard checked by {@code initial(Environment)} so the configuration is not loaded twice. */
private static Boolean initialed = false;
/** Configuration properties shared by all instances; handed to {@code DefaultDictionary.initial}. */
private static ConfigurationProperties properties;
private static String dictRootPath;
/** Dictionary shared by every analyzer that uses {@link #DEFAULT_DOMAIN}; created on first use in the constructor. */
private static Dictionary DEFAULT_DOMAIN_DICTIONARY;
/** Dictionary bound to this configuration instance (the shared default one, or a domain-specific one). */
private Dictionary dictionary;

@Inject
Expand All @@ -51,17 +56,28 @@ public Configuration(Environment env, Settings settings) {
this.useSmart = "true".equals(settings.get("use_smart", "false"));
this.enableLowercase = "true".equals(settings.get("enable_lowercase", "true"));
this.enableRemoteDict = "true".equals(settings.get("enable_remote_dict", "true"));
String domain = settings.get("domain", "default-domain");
// 词源
String etymology = settings.get("etymology", RemoteDictionaryEtymology.DEFAULT.getEtymology());
// 领域
String domain = settings.get("domain", DEFAULT_DOMAIN);
// 配置初始化
Configuration.initial(env);
// 初始化默认词库
DefaultDictionary defaultDictionary = DefaultDictionary.initial(properties);
this.dictionary = Dictionary.initial(this, defaultDictionary, domain);
// 构造词源及领域
URI domainUri = URI.create(String.format("%s://%s", etymology, domain));
if (DEFAULT_DOMAIN.equals(domain)) {
if (Objects.isNull(DEFAULT_DOMAIN_DICTIONARY)) {
DEFAULT_DOMAIN_DICTIONARY = Dictionary.initial(this, defaultDictionary, domainUri);
}
this.dictionary = DEFAULT_DOMAIN_DICTIONARY;
} else {
this.dictionary = Dictionary.initial(this, defaultDictionary, domainUri);
}
}

private synchronized static void initial(Environment env) {
if (Configuration.initialed) {
logger.info("the properties is initialed");
return;
}
// 加载配置文件
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ public static class Dict {
@Data
public static class Remote {

String http;
/**
* http 配置
*/
Http http;

/**
* mysql 配置
Expand Down Expand Up @@ -90,6 +93,11 @@ public static class DictFile {
List<String> stop = Collections.emptyList();
}

/**
 * HTTP settings of the remote dictionary configuration
 * (the type of the {@code http} field of {@code Remote}).
 */
@Data
public static class Http {
// Base address of the HTTP service; initialized to "http://localhost".
String base = "http://localhost";
}

@Data
public static class MySQL {

Expand Down
Loading