
Commit 6a487af

update nlp

1 parent 69af869

9 files changed: +202 additions, -43 deletions

nlp/dic/defined.dic (2 additions & 1 deletion)

```diff
@@ -2,4 +2,5 @@
 知识图谱
 吖啶基氨酸基本基甲烷磺酰甲氧基苯胺
 进展
-有没有进展
+有没有进展
+复联终章快上映了好激动
```
nlp/nlp.properties (1 addition & 1 deletion)

```diff
@@ -1,4 +1,4 @@
-# 自定义词典位置;动态增加的词对应词库也需要配置在该位置
+# 自定义词典位置;动态增加的词对应词库也需要配置在该位置[Hnlp/Thunlp/IK都可生效]
 defined.dic.path=nlp/dic/defined.dic
 # 停止词词库位置
 stopword.dic.path=nlp/dic/stopword.dic;nlp/dic/stopword.utf8
```

src/main/java/data/lab/ongdb/index/FreetextIK.java (13 additions & 0 deletions)

```diff
@@ -50,5 +50,18 @@ public List<String> iKAnalyzer(@Name("text") String text, @Name("useSmart") bool
         return results;
     }

+    /**
+     * @param text:待分词文本
+     * @param useSmart:true 用智能分词,false 细粒度分词
+     * @return
+     * @Description: TODO(支持中英文本分词)
+     */
+    @UserFunction(name = "olab.nlp.ik")
+    @Description("IK分词器 - RETURN olab.nlp.ik({text},true) AS words")
+    public List<String> nlpIk(@Name("text") String text, @Name(value = "useSmart", defaultValue = "true") boolean useSmart) {
+        return iKAnalyzer(text, useSmart);
+    }
+
 }
+
```
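The new `olab.nlp.ik` user function wraps the existing `iKAnalyzer` and gives `useSmart` a default of `true`, so smart segmentation can be invoked with the text argument alone. A minimal usage sketch; actual tokens depend on the configured dictionaries (e.g. the entry 复联终章快上映了好激动 added to defined.dic above should come back as a single term once loaded):

```cql
// Smart segmentation; useSmart defaults to true
RETURN olab.nlp.ik('复联终章快上映了好激动') AS words
// Fine-grained segmentation via the explicit second argument
RETURN olab.nlp.ik('复联终章快上映了好激动', false) AS words
```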

src/main/java/data/lab/ongdb/inferencing/Inference.java (112 additions & 7 deletions)

```diff
@@ -16,16 +16,14 @@
 import data.lab.ongdb.result.MapResult;
 import data.lab.ongdb.schema.auto.AutoCypher;
 import data.lab.ongdb.structure.AdjacencyNode;
+import data.lab.ongdb.util.ArrayUtils;
 import data.lab.ongdb.util.CypherBuilder;
 import data.lab.ongdb.util.FileUtil;
 import data.lab.wltea.analyzer.cfg.Configuration;
 import data.lab.wltea.analyzer.core.IKSegmenter;
 import data.lab.wltea.analyzer.core.Lexeme;
 import org.neo4j.graphdb.GraphDatabaseService;
-import org.neo4j.procedure.Context;
-import org.neo4j.procedure.Description;
-import org.neo4j.procedure.Name;
-import org.neo4j.procedure.UserFunction;
+import org.neo4j.procedure.*;

 import java.io.IOException;
 import java.io.StringReader;
@@ -37,7 +35,7 @@
 import java.util.stream.Stream;

 /**
- * 基于图数据的智能搜索
+ * 基于图数据上下文语义的智能搜索
  *
  * @author Yc-Ma
  * @PACKAGE_NAME: data.lab.ongdb.inferencing
@@ -72,13 +70,13 @@ public class Inference implements SemanticInter {
     @Description("RETURN olab.entity.recognition({graphDataSchema},{nodeHitsRules},{askString},{operator}) AS entityRecognitionHits")
     public Map<String, Object> entityRecognition(@Name("graphDataSchema") String graphDataSchema, @Name("nodeHitsRules") String nodeHitsRules, @Name("askString") String askString, @Name("operator") String operator, @Name(value = "words", defaultValue = "[]") List<String> words) throws IOException {
         if (words != null && !words.isEmpty()) {
-            return executeEntityRecognition(graphDataSchema, nodeHitsRules, operator,words);
+            return executeEntityRecognition(graphDataSchema, nodeHitsRules, operator, words);
         } else {
             /*
              * 获取分词结果
              * */
             words = getSemanticSmartIKSegmenter(askString);
-            return executeEntityRecognition(graphDataSchema, nodeHitsRules, operator,words);
+            return executeEntityRecognition(graphDataSchema, nodeHitsRules, operator, words);
         }
     }

@@ -763,6 +761,113 @@ private List<Map<String, Object>> executeIntentSchemaParse(String graphDataSchem
         }).collect(Collectors.toList());
     }

+    /**
+     * @param graphDataSchemaStr:图数据模型定义【schema主要定义标签和标签之间的关联类型】
+     * @param entityRecognitionHits:实体识别结果{semantic_schema}
+     * @param inferenceWeightStr:搜索本体的权重配置
+     * @return entityRecognitionHits:排序实体识别结果{semantic_schema}
+     * @Description: TODO(根据实体识别结果和本体权重配置,对实体对进行排列组合【权重搜索队列】过程)
+     * //inferenceWeight格式【暂只支持LABEL配置】
+     * {
+     *     "LABEL":
+     *     {
+     *         "label1": "weight",
+     *         "label2": "weight"
+     *     }
+     * }
+     */
+    @Override
+    @Procedure(name = "olab.entity.ptmd.queue")
+    @Description("RETURN olab.entity.ptmd.queue({graphDataSchemaStr},{entityRecognitionHits},{inferenceWeightStr}) AS entityRecognitionHits")
+    public Stream<MapResult> entityPtmdQueue(@Name("graphDataSchemaStr") String graphDataSchemaStr, @Name("entityRecognitionHits") Map<String, Object> entityRecognitionHits, @Name("inferenceWeightStr") String inferenceWeightStr) {
+
+        Map<String, Object> graphDataSchema = JSONObject.parseObject(graphDataSchemaStr);
+
+        List<Map<String, Object>> nodes = getGraphObject(graphDataSchema, "nodes");
+        // 本体模型中所有节点标签,使用entities时node中labels需要和labels求交集
+        List<String> labels = nodes.stream().map(v -> {
+            List<String> list = (List<String>) v.get("labels");
+            return list.get(0);
+        }).collect(Collectors.toList());
+
+        Map<String, List<Map<String, Object>>> entities = (Map<String, List<Map<String, Object>>>) entityRecognitionHits.get("entities");
+
+        // 将词填充到hits列表的每个对象
+        Set<String> words = entities.keySet();
+        for (String word : words) {
+            List<Map<String, Object>> list = entities.get(word);
+            List<Map<String, Object>> listReset = list.stream().peek(v -> v.put("word", word)).collect(Collectors.toList());
+            entities.put(word, listReset);
+        }
+
+        // 一、不移除词,直接求笛卡尔积
+        List<List<Map<String, Object>>> descartesL = new ArrayList<>(new ArrayUtils().descartesStringKey(entities));
+
+        // 二、循环移除一个词,求笛卡尔积
+        List<String> arrWords = Arrays.asList(words.toArray(new String[0]));
+        int size = arrWords.size();
+        for (int i = size - 1; i >= 0; i--) {
+            // 移除一个词
+            Map<String, List<Map<String, Object>>> entitiesTemp = new HashMap<>(entities);
+            entitiesTemp.remove(arrWords.get(i));
+            // 求笛卡尔积
+            List<List<Map<String, Object>>> descartesList = new ArrayUtils().descartesStringKey(entitiesTemp);
+            descartesL.addAll(descartesList);
+        }
+
+        // 三、循环移除两个词,求笛卡尔积
+        // List<String> arrWords = Arrays.asList(words.toArray(new String[0]));
+        for (int i = size - 1; i >= 0; i--) {
+            // 移除两个词
+            Map<String, List<Map<String, Object>>> entitiesTemp = new HashMap<>(entities);
+            entitiesTemp.remove(arrWords.get(i));
+            if (i > 0) {
+                entitiesTemp.remove(arrWords.get(i - 1));
+            }
+            // 求笛卡尔积
+            List<List<Map<String, Object>>> descartesList = new ArrayUtils().descartesStringKey(entitiesTemp);
+            descartesL.addAll(descartesList);
+        }
+
+        // 笛卡尔积组合列表,按照权重排序
+        List<List<Map<String, Object>>> descartesLSort = descartesLSort(descartesL, inferenceWeightStr, labels);
+
+        List<MapResult> entityRecognitionHitsList = new ArrayList<>();
+        for (List<Map<String, Object>> list : descartesLSort) {
+            Map<String, List<Map<String, Object>>> entitiesMap = new HashMap<>();
+            for (Map<String, Object> map : list) {
+                String word = String.valueOf(map.get("word"));
+                entitiesMap.put(word, new ArrayList<Map<String, Object>>() {{
+                    add(map);
+                }});
+            }
+            entityRecognitionHitsList.add(new MapResult(new HashMap<String, Object>() {{
+                put("entities", entitiesMap);
+            }}));
+        }
+        return entityRecognitionHitsList.stream();
+    }
+
+    private List<List<Map<String, Object>>> descartesLSort(List<List<Map<String, Object>>> descartesL, String inferenceWeightStr, List<String> labels) {
+        JSONObject inferenceWeightObj = JSONObject.parseObject(inferenceWeightStr);
+        JSONObject labelsWeight = inferenceWeightObj.getJSONObject("LABEL");
+        return descartesL.stream().sorted((v1, v2) -> {
+            // 计算weight
+            Integer v1f = v1.stream().map(v -> getWeight(labelsWeight, v, labels)).reduce((x, y) -> x += y).get();
+            Integer v2f = v2.stream().map(v -> getWeight(labelsWeight, v, labels)).reduce((x, y) -> x += y).get();
+            Integer v1Score = v1.size() + v1f;
+            Integer v2Score = v2.size() + v2f;
+            return v2Score.compareTo(v1Score);
+        }).collect(Collectors.toList());
+    }
+
+    private int getWeight(JSONObject labelsWeight, Map<String, Object> v, List<String> labels) {
+        List<String> labelsRe = new ArrayList<>(labels);
+        labelsRe.retainAll((List<String>) v.get("labels"));
+        Integer wei = labelsWeight.getInteger(labelsRe.size() > 0 ? labelsRe.get(0) : null);
+        return wei == null ? 0 : wei;
+    }
+
     /**
      * 执行QUERY
      **/
```
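The new `olab.entity.ptmd.queue` procedure builds a weighted search queue from recognition hits: it takes the Cartesian product of the hit lists across all recognized words, repeats the product with one and then two words removed, and sorts every resulting combination by a score of combination size plus the summed per-label weights read from `inferenceWeightStr`. A call sketch follows; the parameters, label names, and the `value` yield field are illustrative assumptions, not confirmed by this diff:

```cql
// Rank entity combinations by label weight (hypothetical labels/weights).
// Assumes entityRecognitionHits comes from olab.entity.recognition and
// that each MapResult row surfaces as a `value` field.
WITH olab.entity.recognition($graphDataSchema, $nodeHitsRules, '有没有进展', 'AND', []) AS hits
CALL olab.entity.ptmd.queue($graphDataSchema, hits, '{"LABEL":{"Event":2,"Entity":1}}') YIELD value
RETURN value AS rankedHits
```

Note that although the weight-format comment shows `"label1": "weight"`, the sorting code reads weights with `getInteger`, so integer values as in the sketch above seem to be what is expected.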

src/main/java/data/lab/ongdb/inferencing/SemanticInter.java (41 additions & 21 deletions)

```diff
@@ -5,12 +5,15 @@
  *
  */

+import data.lab.ongdb.result.MapResult;
+
 import java.io.IOException;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Stream;

 /**
- * 基于图数据的智能搜索接口定义
+ * 基于图数据上下文语义的智能搜索接口定义
  *
  * @author Yc-Ma
  * @PACKAGE_NAME: data.lab.ongdb.inferencing
@@ -162,7 +165,7 @@ public interface SemanticInter {
      * @return 实体识别结果 关键词与hits列表 {entityRecognitionHits} 【包含node、label、relationship、relationshipType四个类型】
      * @Description: TODO
      */
-    Map<String, Object> entityRecognition(String graphDataSchema, String nodeHitsRules, String askString, String operator,List<String> words) throws IOException;
+    Map<String, Object> entityRecognition(String graphDataSchema, String nodeHitsRules, String askString, String operator, List<String> words) throws IOException;

     /*
      * intentSchema入参格式样例:
@@ -269,30 +272,47 @@ public interface SemanticInter {
      * @param graphDataSchema:图数据Schema
      * @param query:查询语句【默认使用IK分词】
      * @param words:query分词结果【如果传入不为空的结果,则query参数失效】
-     * @param intendedIntentStr:预期意图
-     * [
-     *   {
-     *     "label": "LABEL1",
-     *     "sort":2, //意图标签返回优先级
-     *     "list": //意图模式识别可用关键词
-     *     [
-     *     ]
-     *   },
-     *   {
-     *     "label": "LABEL2",
-     *     "sort":1, //意图标签返回优先级
-     *     "list": //意图模式识别可用关键词
-     *     [
-     *     ]
-     *   }
-     * ]
+     * @param intendedIntentStr:预期意图 [
+     *   {
+     *     "label": "LABEL1",
+     *     "sort":2, //意图标签返回优先级
+     *     "list": //意图模式识别可用关键词
+     *     [
+     *     ]
+     *   },
+     *   {
+     *     "label": "LABEL2",
+     *     "sort":1, //意图标签返回优先级
+     *     "list": //意图模式识别可用关键词
+     *     [
+     *     ]
+     *   }
+     * ]
      * @return
-     * @Description: TODO(意图识别模式匹配方法)
+     * @Description: TODO(意图识别模式匹配方法)
      */
-    List<Map<String, Object>> intentSchemaParse(String graphDataSchema, String query,List<String> words, String intendedIntentStr) throws IOException;
+    List<Map<String, Object>> intentSchemaParse(String graphDataSchema, String query, List<String> words, String intendedIntentStr) throws IOException;
+
+    /**
+     * @param graphDataSchemaStr:图数据模型定义【schema主要定义标签和标签之间的关联类型】
+     * @param entityRecognitionHits:实体识别结果{semantic_schema}
+     * @param inferenceWeightStr:搜索本体的权重配置
+     * @return entityRecognitionHits:排序实体识别结果{semantic_schema}
+     * @Description: TODO(根据实体识别结果和本体权重配置,对实体对进行排列组合【权重搜索队列】过程)
+     * //inferenceWeight格式【暂只支持LABEL配置】
+     * {
+     *     "LABEL":
+     *     {
+     *         "label1": "weight",
+     *         "label2": "weight"
+     *     }
+     * }
+     */
+    Stream<MapResult> entityPtmdQueue(String graphDataSchemaStr, Map<String, Object> entityRecognitionHits, String inferenceWeightStr);

 }

+
```

src/main/java/data/lab/ongdb/procedures/FunctionPartition.java (11 additions & 0 deletions)

```diff
@@ -104,6 +104,17 @@ public List<List<Map<String, Object>>> cartesian(@Name("mapList") List<Map<Strin
                                                      @Name("groupField") String groupField) {
         return new ArrayUtils().descartes(mapList, groupField);
     }
+
+    /**
+     * @param modelMap:【传入分好组的MAP】
+     * @return
+     * @Description: TODO(笛卡尔乘积算法【对列表中实体进行笛卡尔乘积运算进行组合】)
+     */
+    @UserFunction(name = "olab.cartesian.byGroupMap")
+    @Description("笛卡尔乘积算法 【对列表中实体使用指定字段进行分组,并进行笛卡尔乘积运算进行组合】")
+    public List<List<Map<String, Object>>> cartesian(@Name("modelMap") Map<String, List<Map<String, Object>>> modelMap) {
+        return new ArrayUtils().descartesStringKey(modelMap);
+    }
 }
```
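The overloaded `olab.cartesian.byGroupMap` skips the grouping step of `olab.cartesian`: the caller passes a map whose values are already-grouped candidate lists, and each combination draws one element from every group. A small sketch; the group keys are illustrative, and per the Java signature the group values should be lists of maps:

```cql
// 1 candidate in g1 x 2 candidates in g2 = 2 combinations,
// e.g. [[{id:1},{id:2}], [{id:1},{id:3}]] (element order within
// a combination may vary with map key ordering)
RETURN olab.cartesian.byGroupMap({g1:[{id:1}], g2:[{id:2},{id:3}]}) AS combos
```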

src/main/java/data/lab/ongdb/util/ArrayUtils.java (11 additions & 13 deletions)

```diff
@@ -25,21 +25,19 @@ public class ArrayUtils {
      * @Description: TODO(笛卡尔乘积算法 【进行笛卡尔乘积运算进行组合】)
      */
     public List<List<Map<String, Object>>> descartes(Map<Object, List<Map<String, Object>>> modelMap) {
-        /*
-         * 按指定字段(type)分组
-         * */
-        Collection<List<Map<String, Object>>> mapValues = modelMap.values();
-
-        /*
-         * 原List
-         * */
-        List<List<Map<String, Object>>> dimensionValue = new ArrayList<>(mapValues);
+        List<List<Map<String, Object>>> result = new ArrayList<>();
+        new ArrayUtils().descartes(new ArrayList<>(modelMap.values()), result, 0, new ArrayList<>());
+        return result;
+    }

-        /*
-         * 返回集合
-         * */
+    /**
+     * @param modelMap:【传入分好组的MAP】
+     * @return
+     * @Description: TODO(笛卡尔乘积算法 【进行笛卡尔乘积运算进行组合】)
+     */
+    public List<List<Map<String, Object>>> descartesStringKey(Map<String, List<Map<String, Object>>> modelMap) {
         List<List<Map<String, Object>>> result = new ArrayList<>();
-        new ArrayUtils().descartes(dimensionValue, result, 0, new ArrayList<>());
+        new ArrayUtils().descartes(new ArrayList<>(modelMap.values()), result, 0, new ArrayList<>());
         return result;
     }
```
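After this refactor, `descartes(Map<Object, ...>)` and the new `descartesStringKey(Map<String, ...>)` both delegate to the same recursive helper over `modelMap.values()`, so grouping by a field up front or handing over a pre-grouped map should yield the same combinations. A comparison sketch under that assumption (illustrative data; combination order may differ because map key order is not guaranteed):

```cql
// Both calls expand to one combination per choice of element across groups
RETURN olab.cartesian([{id:1,type:1},{id:2,type:2},{id:3,type:2}],'type') AS viaField,
       olab.cartesian.byGroupMap({`1`:[{id:1,type:1}],`2`:[{id:2,type:2},{id:3,type:2}]}) AS viaMap
```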

src/main/java/data/lab/wltea/analyzer/dic/Dictionary.java (3 additions & 0 deletions)

```diff
@@ -23,6 +23,7 @@
  */
 package data.lab.wltea.analyzer.dic;

+import data.lab.ongdb.algo.nlp.NLPConfiguration;
 import data.lab.wltea.analyzer.cfg.Configuration;
 import org.apache.http.Header;
 import org.apache.http.HttpEntity;
@@ -248,6 +249,8 @@ private List<String> getExtDictionarys() {
                 }
             }
         }
+        List<String> nlpUserDic = NLPConfiguration.USER_DICT_LIST;
+        extDictFiles.addAll(nlpUserDic);
         return extDictFiles;
     }
```

wiki/3.5.x/OLAB-README-v-3.5.x.md (8 additions & 0 deletions)

````diff
@@ -45,6 +45,7 @@ olab.moveDecimalPoint
 ### 中文分词 *-true 智能分词,false 细粒度分词
 ```cql
 RETURN olab.index.iKAnalyzer('复联终章快上映了好激动,据说知识图谱与人工智能技术应用到了那部电影!吖啶基氨基甲烷磺酰甲氧基苯胺是一种药嘛?',true) AS words
+RETURN olab.nlp.ik('复联终章快上映了好激动,据说知识图谱与人工智能技术应用到了那部电影!吖啶基氨基甲烷磺酰甲氧基苯胺是一种药嘛?') AS words
 ```
 - 组合切词结果后进行查询
 ```cql
@@ -707,13 +708,20 @@ UNWIND rels AS r
 CALL olab.schema.loop.vpath.ind(r,-1,vFMap) YIELD from,rel,to RETURN (from)-[rel]->(to) AS path,vFMap,graph,uniqueGraphID
 ```

+## 获取MAP的key列表
+```
+RETURN olab.map.keys({id:1,name:'a',type:1})
+```
+
 ## 使用笛卡尔乘积算法
 ```
 // 使用笛卡尔乘积算法 【对列表中实体使用指定字段进行分组,并进行笛卡尔乘积运算进行组合】
 // {mapList}:原List
 // {groupField}:列表中对象的分组字段
 RETURN olab.cartesian({mapList},{groupField}) AS cartesianList
 RETURN olab.cartesian([{id:1,name:'a',type:1},{id:2,name:'b',type:1},{id:3,name:'c',type:1},{id:4,name:'d',type:2},{id:5,name:'e',type:3},{id:6,name:'f',type:3}],'type') AS cartesianList
+RETURN olab.cartesian.byGroupMap({`1`:[1,2],`2`:[1,2,3],`3`:[4,5,6]})
+RETURN olab.cartesian.byGroupMap({a1:[1,2],a2:[1,2,3],a3:[4,5,6]})
 ```

 ## 提取图结构并以图搜图将结果转换为虚拟图
````
