Skip to content

Commit 8de3d19

Browse files
authored
Merge pull request #31 from dzhwinter/model_ltr2
Add the example for pairwise and listwise LTR.
2 parents d5cc115 + a87a3c9 commit 8de3d19

File tree

8 files changed

+669
-1
lines changed

8 files changed

+669
-1
lines changed

ltr/README.md

Lines changed: 368 additions & 1 deletion
Large diffs are not rendered by default.

ltr/image/lambdarank.jpg

26 KB
Loading

ltr/image/learningToRank.jpg

35.6 KB
Loading

ltr/image/ranknet.jpg

35.1 KB
Loading
79.4 KB
Loading

ltr/lambdaRank.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import os, sys
2+
import gzip
3+
import paddle.v2 as paddle
4+
import numpy as np
5+
import functools
6+
7+
# lambdaRank is a listwise learning-to-rank model
8+
9+
10+
def lambdaRank(input_dim):
    """
    Build the lambdaRank (listwise) network together with its training cost.

    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf

    parameters :
      input_dim, dimension of one document's dense feature vector

    input layers (both are sequences) :
      "label" : dense_vector_sequence(1)
      "data"  : dense_vector_sequence(input_dim), i.e.
                [[f, ...], [f, ...], ...] where every f is a float or int

    returns :
      (cost, output), the lambda_cost layer and the size-1 linear layer that
      feeds it
    """

    def tanh_fc(source, width):
        # fully connected layer with Tanh activation and initial_std=0.01
        return paddle.layer.fc(
            input=source,
            size=width,
            act=paddle.activation.Tanh(),
            param_attr=paddle.attr.Param(initial_std=0.01))

    # data layers: label first, then the document features
    label = paddle.layer.data(
        name="label", type=paddle.data_type.dense_vector_sequence(1))
    features = paddle.layer.data(
        name="data", type=paddle.data_type.dense_vector_sequence(input_dim))

    # two hidden layers (128 -> 10) followed by a linear layer of size 1
    hidden_wide = tanh_fc(features, 128)
    hidden_narrow = tanh_fc(hidden_wide, 10)
    output = paddle.layer.fc(
        input=hidden_narrow,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    # evaluator: the call is kept, its return value is not needed locally
    paddle.evaluator.auc(input=output, label=label)

    # cost layer
    cost = paddle.layer.lambda_cost(
        input=output, score=label, NDCG_num=6, max_sort_size=-1)
    return cost, output
49+
50+
51+
def train_lambdaRank(num_passes):
    """
    Train the lambdaRank network on mq2007 read in "listwise" format.

    At the end of every pass the model is tested with the test reader and the
    parameters are written to "lambdaRank_params_<pass_id>.tar.gz".

    parameters :
      num_passes, number of passes handed to trainer.train
    """
    # listwise input sequence
    listwise_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    listwise_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    shuffled_train = paddle.reader.shuffle(listwise_train, buf_size=100)
    train_reader = paddle.batch(shuffled_train, batch_size=32)
    test_reader = paddle.batch(listwise_test, batch_size=32)

    # mq2007 input_dim = 46, dense format
    input_dim = 46
    cost, output = lambdaRank(input_dim)
    parameters = paddle.parameters.create(cost)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # index of each data layer inside one reader sample
    feeding = {"label": 0, "data": 1}

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            progress = (event.pass_id, event.batch_id, event.cost)
            print("Pass %d Batch %d Cost %.9f" % progress)
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
            snapshot_path = "lambdaRank_params_%d.tar.gz" % (event.pass_id)
            with gzip.open(snapshot_path, "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
89+
90+
91+
def lambdaRank_infer(pass_id):
    """
    lambdaRank model inference interface.

    Rebuilds the network, loads saved parameters and prints the prediction
    for the first query list of the mq2007 "listwise" test reader.

    parameters:
      pass_id : selects the parameter file; the file that is opened is
                "lambdaRank_params_<pass_id - 1>.tar.gz"
    """
    print("Begin to Infer...")
    input_dim = 46
    # lambdaRank returns (cost, output). Only the score layer is wanted for
    # inference, so unpack the pair instead of handing the tuple to
    # paddle.infer as the output layer.
    _, output = lambdaRank(input_dim)

    # close the archive as soon as the parameters have been read
    with gzip.open("lambdaRank_params_%d.tar.gz" % (pass_id - 1)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # collect the first `infer_data_num` query lists of the test set
    infer_data = []
    infer_data_num = 1
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    for label, querylist in fill_default_test():
        # NOTE(review): each query list is appended as-is; confirm that this
        # is the sample layout paddle.infer expects for a single data layer.
        infer_data.append(querylist)
        if len(infer_data) == infer_data_num:
            break

    # predict score of infer_data document. Re-sort the document base on
    # predict score in descending order. then we build the ranking documents
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        print("%s %s" % (i, score))
119+
120+
121+
if __name__ == "__main__":
    # initialise paddle without GPU, trainer_count=4
    paddle.init(trainer_count=4, use_gpu=False)
    # train for two passes, then run inference with pass_id=1
    train_lambdaRank(2)
    lambdaRank_infer(pass_id=1)

ltr/metrics.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import numpy as np
2+
import unittest
3+
4+
5+
def ndcg(score_list):
    """
    Measure the NDCG score of an ordered list of relevance scores.
    https://en.wikipedia.org/wiki/Discounted_cumulative_gain

    parameter:
      score_list: relevance scores in ranked order (index 0 is the top
                  result); a list or np.array of shape (sample_num,) or
                  (sample_num, 1)

    return:
      float, DCG(score_list) divided by the DCG of the same scores sorted in
      descending order. 0.0 is returned when that ideal DCG is zero (empty
      input or all scores zero), where the ratio is undefined.

    e.g. predict rank score list :
    >>> scores = [3, 2, 3, 0, 1, 2]
    >>> ndcg_score = ndcg(scores)

    """
    # flatten so that (sample_num, 1) columns and plain lists behave alike
    scores = np.asarray(score_list, dtype=float).ravel()

    def dcg(ordered):
        # The score at 0-based rank i is discounted by log(i + 2). The
        # natural log is used; the log base cancels in the final ratio.
        discounts = np.log(np.arange(ordered.size) + 2.0)
        return float(np.sum(ordered / discounts))

    ideal_cost = dcg(np.sort(scores)[::-1])
    if ideal_cost == 0:
        # nothing relevant to rank: avoid 0 / 0
        return 0.0
    return dcg(scores) / ideal_cost
29+
30+
31+
class NdcgTest(unittest.TestCase):
    """Unit test for ndcg, using the score list from the ndcg docstring."""

    def __init__(self, methodName="test_ndcg"):
        # unittest passes the name of the test method to the constructor, so
        # it must be accepted and forwarded to the base class. The default
        # keeps a bare NdcgTest() call working.
        super(NdcgTest, self).__init__(methodName)

    def test_ndcg(self):
        # the "test" prefix is what lets unittest.main() discover this case
        a = [3, 2, 3, 0, 1, 2]
        value = ndcg(a)
        self.assertAlmostEqual(0.961, value, places=3)

    # keep the original method name available for existing callers
    runcase = test_ndcg
39+
40+
41+
if __name__ == "__main__":
    # hand control to the unittest command-line runner
    unittest.main()

ltr/ranknet.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import os
2+
import sys
3+
import gzip
4+
import functools
5+
import paddle.v2 as paddle
6+
import numpy as np
7+
from metrics import ndcg
8+
9+
# RankNet is the classic pairwise learning-to-rank algorithm
10+
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
11+
12+
13+
def half_ranknet(name_prefix, input_dim):
    """
    Build one branch of ranknet: data layer -> Tanh fc (size 10) -> Linear
    fc (size 1).

    parameter in same name will be shared in paddle framework, so the left
    network and the right network built by this function can use the same
    "hidden_w1" and "output" parameters. shared parameters in detail:
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md

    parameters :
      name_prefix, prefix of the data layer name "<name_prefix>/data"
      input_dim, dimension of the dense feature vector

    returns :
      the size-1 linear output layer
    """
    # data layer
    features = paddle.layer.data(
        name=name_prefix + "/data",
        type=paddle.data_type.dense_vector(input_dim))

    # hidden layer
    # NOTE(review): only the weights are given explicit names here; confirm
    # how the bias parameters of the two branches are named.
    tanh = paddle.activation.Tanh()
    hidden_weights = paddle.attr.Param(initial_std=0.01, name="hidden_w1")
    hidden = paddle.layer.fc(
        input=features, size=10, act=tanh, param_attr=hidden_weights)

    # fully connect layer/ output layer
    linear = paddle.activation.Linear()
    output_weights = paddle.attr.Param(initial_std=0.01, name="output")
    return paddle.layer.fc(
        input=hidden, size=1, act=linear, param_attr=output_weights)
37+
38+
39+
def ranknet(input_dim):
    """
    Build the pairwise ranknet cost from two half_ranknet branches.

    parameters :
      input_dim, dimension of one document's dense feature vector

    returns :
      the rank_cost layer named "cost"
    """
    # label layer
    label = paddle.layer.data(
        name="label", type=paddle.data_type.dense_vector(1))

    # reuse the parameter in half_ranknet: left branch first, then right
    left_score, right_score = [
        half_ranknet(side, input_dim) for side in ("left", "right")
    ]

    # evaluator on the left branch; its return value is not needed locally
    paddle.evaluator.auc(input=left_score, label=label)

    # rankcost layer
    return paddle.layer.rank_cost(
        name="cost", left=left_score, right=right_score, label=label)
52+
53+
54+
def train_ranknet(num_passes):
    """
    Train ranknet on the mq2007 dataset read with its default format.

    At the end of every pass the model is tested with the test reader and the
    parameters are written to "ranknet_params_<pass_id>.tar.gz".

    parameters :
      num_passes, number of passes handed to trainer.train
    """
    shuffled_train = paddle.reader.shuffle(
        paddle.dataset.mq2007.train, buf_size=100)
    train_reader = paddle.batch(shuffled_train, batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    optimizer = paddle.optimizer.Adam(learning_rate=2e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # Define the input data order
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 != 0:
                # a dot for every batch that is not reported in full
                sys.stdout.write(".")
                sys.stdout.flush()
            else:
                progress = (event.pass_id, event.batch_id, event.cost)
                print("Pass %d Batch %d Cost %.9f" % progress)
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
            snapshot_path = "ranknet_params_%d.tar.gz" % (event.pass_id)
            with gzip.open(snapshot_path, "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
95+
96+
97+
def ranknet_infer(pass_id):
    """
    load the trained model. And predict with plain txt input

    Builds a single half_ranknet branch, loads saved parameters and prints
    one predicted score per document of the mq2007 "plain_txt" test reader.

    parameters :
      pass_id, selects the parameter file; the file that is opened is
               "ranknet_params_<pass_id - 1>.tar.gz"
    """
    print("Begin to Infer...")
    feature_dim = 46

    # we just need half_ranknet to predict a rank score, which can be used in
    # sort documents
    output = half_ranknet("left", feature_dim)

    # close the archive as soon as the parameters have been read
    with gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # load data of same query and relevance documents, need ranknet to rank
    # these candidates
    infer_query_id = []
    infer_data = []

    # convert to mq2007 built-in data format
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    # the relevance score is not needed for prediction
    for query_id, _, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        # NOTE(review): each feature vector is appended as-is; confirm that
        # this is the sample layout paddle.infer expects for one data layer.
        infer_data.append(feature_vector)

    # predict score of infer_data document. Re-sort the document base on
    # predict score in descending order. then we build the ranking documents
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for query_id, score in zip(infer_query_id, scores):
        # the doubled spaces reproduce the output of the original
        # comma-separated print statement
        print("query_id :  %s  ranknet rank document order :  %s" %
              (query_id, score))
129+
130+
131+
if __name__ == "__main__":
    # initialise paddle without GPU, trainer_count=4
    paddle.init(trainer_count=4, use_gpu=False)
    # train for `total_passes` passes, then infer with pass_id one below it
    total_passes = 2
    train_ranknet(total_passes)
    ranknet_infer(pass_id=total_passes - 1)

0 commit comments

Comments
 (0)