Skip to content

Commit 9e8561d

Browse files
committed
new scoring engines
1 parent 395a877 commit 9e8561d

File tree

2 files changed

+95
-15
lines changed

2 files changed

+95
-15
lines changed

lib/lib.js

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ ExtractContentJS.Lib.Util = (function() {
4040
self.stop = function(name){ return self.get(name).stop(); };
4141
return self;
4242
};
43-
Util.Kind = function(word) {
43+
Util.Token = function(word) {
4444
var regex = {
4545
hiragana: /[---]/,
4646
katakana: /[---]/,
@@ -63,7 +63,7 @@ ExtractContentJS.Lib.Util = (function() {
6363
first: tests(word.charAt(0)),
6464
last: tests(word.charAt(word.length-1))
6565
};
66-
self.tokenized = function(prev, next) {
66+
self.isTokenized = function(prev, next) {
6767
var p = prev.length ? prev.charAt(prev.length-1) : '';
6868
var n = next.length ? next.charAt(0) : '';
6969
var check = function(w, test) {
@@ -76,6 +76,7 @@ ExtractContentJS.Lib.Util = (function() {
7676
};
7777
return check(p, self.first) && check(n, self.last);
7878
};
79+
7980
return self;
8081
};
8182
Util.inherit = function(child,parent) {
@@ -101,15 +102,27 @@ ExtractContentJS.Lib.Util = (function() {
101102
/**
 * Count the occurrences of `word` in `text` that pass the token-boundary
 * check of Util.Token#isTokenized.
 *
 * The text is split on `word`; occurrence number i lies between the
 * segments texts[i-1] and texts[i]. isTokenized only inspects the last
 * character of the preceding segment and the first character of the
 * following one, and it accepts empty segments.
 *
 * @param {string} text  Text to search (callers lower-case it first).
 * @param {string} word  Word to look for.
 * @returns {number} Number of occurrences accepted by isTokenized.
 */
Util.countMatchTokenized = function(text, word) {
    var count = 0;
    var tok = new Util.Token(word);
    var texts = text.split(word);
    var len = texts.length;
    // Start at 1: every occurrence has a (possibly empty) segment before
    // it. An empty preceding segment means the word sits at the very start
    // of the text or right after another occurrence; it must still be
    // tested instead of being skipped as "no previous segment".
    for (var i = 1; i < len; i++) {
        if (tok.isTokenized(texts[i-1], texts[i])) count++;
    }
    return count;
};
114+
/**
 * Return the index of the first occurrence of `word` in `text`, provided
 * that occurrence passes the token-boundary check of
 * Util.Token#isTokenized; otherwise return -1.
 *
 * Only the first occurrence is examined: if it fails the boundary check,
 * later occurrences are not searched.
 *
 * @param {string} text  Text to search.
 * @param {string} word  Word to look for.
 * @returns {number} Index of the first occurrence when it is tokenized,
 *                   else -1.
 */
Util.indexOfTokenized = function(text, word) {
    var index = text.indexOf(word);
    if (index < 0) return -1;

    var tok = new Util.Token(word);
    // Character immediately before the match; empty only when the match
    // starts at index 0 (a match at index 1 does have a predecessor).
    var p = index > 0 ? text.charAt(index-1) : '';
    // Character immediately after the match; '' when the match ends the text.
    var n = text.charAt(index+word.length);
    return tok.isTokenized(p, n) ? index : -1;
};
113126
Util.dump = function(obj) {
114127
if (typeof obj == 'undefined') return 'undefined';
115128
if (typeof obj == 'string') return '"' + obj + '"';

lib/scoring-words.js

Lines changed: 78 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ if (typeof ExtractContentJS == 'undefined') {
1515
if (typeof ns.RelativeWords.Engine != 'undefined') {
1616
return new ns.RelativeWords.Engine[name];
1717
}
18+
return null;
1819
}
1920
};
2021

@@ -52,12 +53,16 @@ if (typeof ExtractContentJS == 'undefined') {
5253
text: 32768
5354
}
5455
};
55-
var self = { weight: opt.weight || 0.3 };
56+
var self = { weight: opt.weight || 0.4 };
5657

5758
self.vote = function(doc, words) {
5859
var total = 0;
5960
var max = 0;
6061
var scores = {};
62+
var content = (doc.content+'').substr(0, opt.limit.text);
63+
content = content.toLowerCase();
64+
var title = (doc.title||'').toLowerCase();
65+
var url = (doc.url||'').toLowerCase();
6166
for (var t in words) total += words[t].df;
6267
for (var t in words) {
6368
var df = words[t].df;
@@ -67,20 +72,47 @@ if (typeof ExtractContentJS == 'undefined') {
6772

6873
var tf = 0;
6974
var w = t.toLowerCase();
70-
var text = (doc.content+'').substr(0, opt.limit.text);
71-
tf += Util.countMatchTokenized(text.toLowerCase(), w);
72-
if (doc.title) {
73-
tf += Util.countMatchTokenized(doc.title.toLowerCase(), w);
74-
}
75-
if (doc.url) {
76-
tf += Util.countMatchTokenized(doc.url.toLowerCase(), w);
77-
}
75+
tf += Util.countMatchTokenized(content, w);
76+
tf += Util.countMatchTokenized(title, w);
77+
tf += Util.countMatchTokenized(url, w);
7878

7979
scores[t] = tf/idf;
8080
if (scores[t] > max) max = scores[t];
8181
}
8282
if (!max) return;
83+
for (var t in scores) {
84+
var score = scores[t] / max; // normalize
85+
words[t].score += score * self.weight;
86+
}
87+
};
88+
89+
return self;
90+
};
91+
92+
ns.RelativeWords.Engine.ContentPosition = function() {
93+
var opt = arguments[0] || {
94+
limit: {
95+
text: 32768
96+
}
97+
};
98+
var self = { weight: opt.weight || 0.1 };
99+
100+
self.vote = function(doc, words) {
101+
var max = 0;
102+
var scores = {};
103+
var content = (doc.content+'').substr(0, opt.limit.text);
104+
content = content.toLowerCase();
83105
for (var t in words) {
106+
var w = t.toLowerCase();
107+
var index = Util.indexOfTokenized(content, w);
108+
if (index >= 0) {
109+
scores[t] = scores[t] || 0;
110+
scores[t] += 1.0 / (index+1);
111+
if (max < scores[t]) max = scores[t];
112+
}
113+
}
114+
if (!max) return;
115+
for (var t in scores) {
84116
var score = scores[t] / max; // normalize
85117
words[t].score += score * self.weight;
86118
}
@@ -89,11 +121,46 @@ if (typeof ExtractContentJS == 'undefined') {
89121
return self;
90122
};
91123

124+
/**
 * Scoring engine that rewards words found in the document title, with a
 * larger reward the earlier the word appears.
 *
 * Optional first argument: an options object. `weight.global` (default
 * 0.4) scales the normalized score added to each word; `weight.title`
 * (default 0.35) scales the position-dependent bonus.
 */
ns.RelativeWords.Engine.TitlePosition = function() {
    var options = arguments[0] || { limit: { text: 32768 } };
    var given = options.weight;
    var self = {
        weight: {
            global: (given && given.global) || 0.4,
            title: (given && given.title) || 0.35
        }
    };

    /**
     * Add a title-position score to every word found in doc.title.
     *
     * @param {Object} doc    Document; only `title` is read.
     * @param {Object} words  Map of word -> record with a numeric `score`,
     *                        updated in place.
     */
    self.vote = function(doc, words) {
        var loweredTitle = (doc.title||'').toLowerCase();
        var raw = {};
        var best = 0;
        var term;

        for (term in words) {
            var pos = Util.indexOfTokenized(loweredTitle, term.toLowerCase());
            if (pos < 0) continue;
            // Base of 1 for appearing at all, plus a bonus that decays
            // logarithmically with the position in the title.
            raw[term] = 1 + self.weight.title / (1+Math.log(pos+1));
            if (best < raw[term]) best = raw[term];
        }

        // No word matched the title: leave all scores unchanged.
        if (!best) return;

        for (term in raw) {
            var normalized = raw[term] / best;
            words[term].score += normalized * self.weight.global;
        }
    };

    return self;
};
158+
92159
/**
 * Rank candidate tags for a document using the TfIdf, ContentPosition and
 * TitlePosition engines, registered in that order.
 *
 * @param {string} url    Document URL.
 * @param {string} title  Document title.
 * @param {string} body   Document content.
 * @param {*} tags        Candidate tags, passed through to RelativeWords#top.
 * @returns {*} Whatever RelativeWords#top returns for this document.
 */
ns.suggestTags = function(url, title, body, tags) {
    var scorer = new ns.RelativeWords();
    var engineNames = ['TfIdf', 'ContentPosition', 'TitlePosition'];
    for (var i = 0; i < engineNames.length; i++) {
        scorer.addEngine( scorer.factory.getEngine(engineNames[i]) );
    }
    var doc = { url: url, title: title, content: body };
    return scorer.top(doc, tags);
};
99166

0 commit comments

Comments
 (0)