Skip to content

Commit 9cb8916

Browse files
committed
add suggestion feature.
1 parent 5794794 commit 9cb8916

File tree

3 files changed

+154
-34
lines changed

3 files changed

+154
-34
lines changed

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
- 解决命中不全BUG
1414
- 3.1
1515
- 增加词频统计
16+
- 3.5
17+
- 清除词频统计 [没有什么意义]
18+
- 增加Suggestion特性 根据某个word提取相关的词语
1619

1720
## 注意
1821
- 在即时场景中(即时更新关键词),如果关键词数量较大,到十万甚至百万级别,尽量不要使用CGI模式,首次加载需要较大的性能开销,多个进程同时使用会造成一定的内存浪费,整体性能会下降,会拖垮web服务。这种情况下建议使用swoole单独封装服务,目前十万级别的关键词,已经在生产环境中验证过并运行良好。
@@ -47,11 +50,39 @@ var_dump($res);
4750
$res = $tree->delete("张三");
4851
//删除整棵树 连带“张三”和张三下的“张三四”一并删除
4952
$tree->delete("张三",true);
53+
54+
55+
56+
//拼音检测
57+
$tree->append("zhangsan","",true,"张三");
58+
$tree->append("zhangsan","",true,"张伞");
59+
60+
$t1 = microtime(true);
61+
var_dump($tree->getTreeWord("zh"));
62+
$t2 = microtime(true);
63+
echo 'getTreeWordPinyin{' . ($t2 - $t1) . '}s'.PHP_EOL;
64+
65+
66+
//replace & delete
67+
$tree->append("z","",true,"在");
68+
$tree->append("z","",true,"走");
69+
$tree->append("z","",true,"做");
70+
var_dump($tree->getTreeWord("z",4));
71+
//覆盖
72+
$tree->append("z",array("1"=>1),true,"做");
73+
var_dump($tree->getTreeWord("z",4));
74+
//删除
75+
$tree->delete("z",false,true,"在");
76+
var_dump($tree->getTreeWord("z",4));
77+
$tree->delete("z",false,true,"走");
78+
$tree->delete("z",false,true,"做");
79+
var_dump($tree->getTreeWord("z", 4));
5080
```
5181

5282
## 使用场景
5383
- 敏感词过滤
5484
- 内链建设
85+
- 搜索框提示
5586

5687
## 性能
5788
test目录下有约1.5万个敏感词。

src/TrieTree.php

Lines changed: 93 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
*/
1616
class TrieTree {
1717
protected $nodeTree = [];
18-
protected $count = 0;
1918

2019
/**
2120
* 构造
@@ -27,10 +26,15 @@ public function __construct() {
2726

2827
/**
2928
* 从树中摘除一个文本
30-
* @param $str
29+
* @param $index_str
3130
*/
32-
public function delete($str, $deltree = false) {
33-
$str = trim($str);
31+
public function delete($index_str, $deltree = false, $is_py = false, $chinese = "") {
32+
$str = trim($index_str);
33+
$chinese = trim($chinese);
34+
if ($is_py && empty($chinese)) {
35+
return false;
36+
}
37+
3438
$delstr_arr = $this->convertStrToH($str);
3539
$len = count($delstr_arr);
3640
//提取树
@@ -64,10 +68,33 @@ public function delete($str, $deltree = false) {
6468
}
6569
//只有一个字 直接删除
6670
if ($idx == 0) {
67-
if (count($del_index[$idx]['index']['child']) == 0) {
68-
unset($this->nodeTree[$del_index[$idx]['code']]);
69-
return true;
71+
//如果是拼音 只删除相应的拼音索引
72+
if ($is_py) {
73+
//清除单个拼音索引
74+
if (isset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])) {
75+
$is_del = false;
76+
foreach ($this->nodeTree[$del_index[$idx]['code']]['chinese_list'] as $key=>$node) {
77+
if ($node['word'] == $chinese){
78+
unset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'][$key]);
79+
$is_del = true;
80+
break;
81+
}
82+
}
83+
if($is_del && 0 != count($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])){
84+
return true;
85+
}
86+
if(!$is_del){
87+
return false;
88+
}
89+
//如果依然存在中文数据 则继续向下跑删除节点
90+
}
91+
}else{
92+
if (count($del_index[$idx]['index']['child']) == 0) {
93+
unset($this->nodeTree[$del_index[$idx]['code']]);
94+
return true;
95+
}
7096
}
97+
7198
}
7299
//末梢为关键词结尾,且存在子集 清除结尾标签
73100
if (count($del_index[$idx]['index']['child']) > 0) {
@@ -98,12 +125,17 @@ public function delete($str, $deltree = false) {
98125
/**
99126
* ADD word [UTF8]
100127
* 增加新特性,在枝干末梢增加自定义数组
101-
* @param $str 添加的词
128+
* @param $index_str 添加的词
102129
* @param array $data 添加词的附加属性
103130
* @return $this
104131
*/
105-
public function append($str, $data = array()) {
106-
$str = trim($str);
132+
public function append($index_str, $data = array(), $is_py = false, $chinese = '') {
133+
$str = trim($index_str);
134+
$chinese = trim($chinese);
135+
if ($is_py && empty($chinese)) {
136+
return false;
137+
}
138+
107139
$childTree = &$this->nodeTree;
108140
$len = strlen($str);
109141
for ($i = 0; $i < $len; $i++) {
@@ -137,19 +169,16 @@ public function append($str, $data = array()) {
137169
}
138170
if ($i == ($len - 1)) {
139171
$is_end = true;
172+
if ($is_py) {
173+
$str = $chinese;
174+
}
140175
}
141-
$childTree = &$this->_appendWordToTree($childTree, $code, $word, $is_end, $data, $str);
142-
176+
$childTree = &$this->_appendWordToTree($childTree, $code, $word, $is_end, $data, $str, $is_py);
143177
}
144-
$this->count++;
145178
unset($childTree);
146179
return $this;
147180
}
148181

149-
public function getCount() {
150-
return $this->count;
151-
}
152-
153182
/**
154183
* 追加一个字[中英文]到树中
155184
* @param $tree
@@ -160,18 +189,36 @@ public function getCount() {
160189
* @param string $full_str
161190
* @return mixed
162191
*/
163-
private function &_appendWordToTree(&$tree, $code, $word, $end = false, $data = array(), $full_str = '') {
192+
private function &_appendWordToTree(&$tree, $code, $word, $end = false, $data = array(), $full_str = '', $is_py) {
164193
if (!isset($tree[$code])) {
165194
$tree[$code] = array(
166195
'end' => $end,
167196
'child' => array(),
168-
'value' => $word
197+
'value' => $word,
169198
);
170199
}
171200
if ($end) {
172201
$tree[$code]['end'] = true;
173-
$tree[$code]['data'] = $data;
174-
$tree[$code]['full'] = $full_str;
202+
$tree[$code]['is_py'] = $is_py;
203+
//拼音不需要full 拼音根据读音多样性对应多个词语 重复词语覆盖data
204+
if ($is_py) {
205+
$is_change = false;
206+
if(isset($tree[$code]["chinese_list"]) && count($tree[$code]["chinese_list"])>0) {
207+
foreach ($tree[$code]["chinese_list"] as $key => &$node) {
208+
if ($node['word'] == $full_str) {
209+
$node['data'] = $data;
210+
$is_change = true;
211+
break;
212+
}
213+
}
214+
}
215+
if(!$is_change){
216+
$tree[$code]['chinese_list'][] = ["word" => $full_str, "data" => $data];
217+
}
218+
} else {
219+
$tree[$code]['full'] = $full_str;
220+
$tree[$code]['data'] = $data;
221+
}
175222
}
176223

177224
return $tree[$code]['child'];
@@ -185,13 +232,19 @@ public function getTree() {
185232
return $this->nodeTree;
186233
}
187234

188-
public function getTreeWord($word, $count = 0) {
235+
/**
236+
* 匹配下面的全部词语
237+
* @param $word
238+
* @param int $deep 检索深度 检索之后的词语数量可能会大于这个数字
239+
* @return array|bool
240+
*/
241+
public function getTreeWord($word, $deep = 0) {
189242
$search = trim($word);
190243
if (empty($search)) {
191244
return false;
192245
}
193-
if($count===0){
194-
$count = 9999;
246+
if ($deep === 0) {
247+
$deep = 999;
195248
}
196249

197250
$word_keys = $this->convertStrToH($search);
@@ -202,27 +255,35 @@ public function getTreeWord($word, $count = 0) {
202255
if (isset($tree[$val])) {
203256
//检测当前词语是否已命中
204257
if ($key == $key_count - 1 && $tree[$val]['end'] == true) {
205-
$words[] = ["word" => $tree[$val]['full'], "data" => $tree[$val]['data']];
258+
if (isset($tree[$val]['chinese_list'])) {
259+
$words = array_merge($words, $tree[$val]['chinese_list']);
260+
} else {
261+
$words[] = ["word" => $tree[$val]['full'], "data" => $tree[$val]['data']];
262+
}
206263
}
207264
$tree = &$tree[$val]["child"];
208-
}else{
265+
} else {
209266
//第一个字符都没有命中
210-
if($key == 0){
267+
if ($key == 0) {
211268
return [];
212269
}
213270
}
214271
}
215-
$this->_getTreeWord($tree, $count, $words);
272+
$this->_getTreeWord($tree, $deep, $words);
216273
return $words;
217274
}
218275

219-
private function _getTreeWord(&$child, $count, &$words = array()) {
276+
private function _getTreeWord(&$child, $deep, &$words = array()) {
220277
foreach ($child as $node) {
221278
if ($node['end'] == true) {
222-
$words[] = ["word" => $node['full'], "data" => $node['data']];
279+
if (isset($node['chinese_list'])) {
280+
$words = array_merge($words, $node['chinese_list']);
281+
} else {
282+
$words[] = ["word" => $node['full'], "data" => $node['data']];
283+
}
223284
}
224-
if (!empty($node['child']) && $count >= count($words)) {
225-
$this->_getTreeWord($node['child'], $count, $words);
285+
if (!empty($node['child']) && $deep >= count($words)) {
286+
$this->_getTreeWord($node['child'], $deep, $words);
226287
}
227288
}
228289
}

test/console.php

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,35 @@
6565
$t3 = microtime(true);
6666
echo 'DELETE RES:'.$del_res.PHP_EOL;
6767
var_dump($tree->search($str));
68+
6869
$t1 = microtime(true);
69-
var_dump($tree->getTreeWord(""));
70+
var_dump($tree->getTreeWord("a"));
7071
$t2 = microtime(true);
71-
echo 'SearchTime{' . ($t2 - $t1) . '}s'.PHP_EOL;
72+
echo 'getTreeWord{' . ($t2 - $t1) . '}s'.PHP_EOL;
73+
74+
75+
76+
//拼音检测
77+
$tree->append("zhangsan","",true,"张三");
78+
$tree->append("zhangsan","",true,"张伞");
79+
80+
$t1 = microtime(true);
81+
var_dump($tree->getTreeWord("zh"));
82+
$t2 = microtime(true);
83+
echo 'getTreeWordPinyin{' . ($t2 - $t1) . '}s'.PHP_EOL;
84+
85+
86+
//replace & delete
87+
$tree->append("z","",true,"");
88+
$tree->append("z","",true,"");
89+
$tree->append("z","",true,"");
90+
var_dump($tree->getTreeWord("z",4));
91+
//覆盖
92+
$tree->append("z",array("1"=>1),true,"");
93+
var_dump($tree->getTreeWord("z",4));
94+
//删除
95+
$tree->delete("z",false,true,"");
96+
var_dump($tree->getTreeWord("z",4));
97+
$tree->delete("z",false,true,"");
98+
$tree->delete("z",false,true,"");
99+
var_dump($tree->getTreeWord("z", 4));

0 commit comments

Comments
 (0)