|  | 
|  | 1 | +import os | 
|  | 2 | +import sys | 
|  | 3 | +import gzip | 
|  | 4 | +import functools | 
|  | 5 | +import paddle.v2 as paddle | 
|  | 6 | +import numpy as np | 
|  | 7 | +from metrics import ndcg | 
|  | 8 | + | 
|  | 9 | +# ranknet is the classic pairwise learning to rank algorithm | 
|  | 10 | +# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf | 
|  | 11 | + | 
|  | 12 | + | 
def half_ranknet(name_prefix, input_dim):
    """
    Build one scoring tower of RankNet and return its output layer.

    Parameters that carry the same name are shared by the paddle framework,
    so the left and the right tower built by this function use one set of
    parameters. Shared parameters in detail:
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md

    Both the weights and the biases are named explicitly. An unnamed bias
    gets a name derived from the auto-generated layer name, so it would be
    neither shared between the two towers nor resolvable when a tower is
    rebuilt later to load saved parameters.
    NOTE(review): the auto-naming behaviour is taken from the Paddle v2
    documentation, not from this file - confirm against the installed version.

    :param name_prefix: prefix of the data layer name; the data layer is
        called ``name_prefix + "/data"``.
    :param input_dim: dimension of the dense input feature vector.
    :return: the size-1 fc layer with linear activation that yields the score.
    """
    # data layer
    data = paddle.layer.data(name_prefix + "/data",
                             paddle.data_type.dense_vector(input_dim))

    # hidden layer
    hd1 = paddle.layer.fc(
        input=data,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"),
        # NOTE(review): zero mean/std is meant to mirror Paddle's default
        # bias initialisation - confirm.
        bias_attr=paddle.attr.Param(
            initial_mean=0.0, initial_std=0.0, name="hidden_b1"))

    # fully connected output layer: one linear unit, the ranking score
    output = paddle.layer.fc(
        input=hd1,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="output"),
        bias_attr=paddle.attr.Param(
            initial_mean=0.0, initial_std=0.0, name="output_b"))
    return output
|  | 37 | + | 
|  | 38 | + | 
def ranknet(input_dim):
    """
    Assemble the pairwise RankNet topology and return its cost layer.

    :param input_dim: dimension of the dense feature vector fed to each tower.
    :return: the rank_cost layer named "cost".
    """
    # one-dimensional dense label for each (left, right) pair
    pair_label = paddle.layer.data(
        name="label", type=paddle.data_type.dense_vector(1))

    # the two towers; half_ranknet gives its weights fixed names so that
    # they are reused by both calls
    left_score = half_ranknet("left", input_dim)
    right_score = half_ranknet("right", input_dim)

    # The return value was never used by the original code; the call itself
    # is kept.
    # NOTE(review): presumably creating the evaluator registers an AUC metric
    # as a side effect - confirm.
    paddle.evaluator.auc(input=left_score, label=pair_label)

    # rank cost over the two scores and the pair label
    return paddle.layer.rank_cost(
        name="cost", left=left_score, right=right_score, label=pair_label)
|  | 52 | + | 
|  | 53 | + | 
def train_ranknet(num_passes):
    """
    Train RankNet on the mq2007 dataset.

    After every pass the model is evaluated on the test reader and the
    parameters are written to ``ranknet_params_<pass_id>.tar.gz`` in the
    current directory.

    :param num_passes: number of passes handed to ``trainer.train``.
    """
    shuffled_train = paddle.reader.shuffle(
        paddle.dataset.mq2007.train, buf_size=100)
    train_batches = paddle.batch(shuffled_train, batch_size=100)
    test_batches = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    # (the hidden layer width is fixed inside half_ranknet)
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    optimizer = paddle.optimizer.Adam(learning_rate=2e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # position of each data layer's value inside one reader sample
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    def event_handler(event):
        """Report progress per batch; test and checkpoint per pass."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100:
                # every other batch: just a progress dot
                sys.stdout.write(".")
                sys.stdout.flush()
            else:
                # every 100th batch: print the current cost
                print("Pass %d Batch %d Cost %.9f" %
                      (event.pass_id, event.batch_id, event.cost))
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_batches, feeding=feeding)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
            archive_name = "ranknet_params_%d.tar.gz" % (event.pass_id)
            with gzip.open(archive_name, "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_batches,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
|  | 95 | + | 
|  | 96 | + | 
def ranknet_infer(pass_id):
    """
    Load a trained model and print a predicted score for every document of
    the mq2007 test set, read in plain_txt format.

    :param pass_id: id of the training pass whose parameters are loaded; the
        file read is ``ranknet_params_<pass_id>.tar.gz`` in the current
        directory.
    """
    print("Begin to Infer...")
    feature_dim = 46

    # we just need half_ranknet to predict a rank score, which can be used
    # to sort documents
    output = half_ranknet("left", feature_dim)

    # The archive name uses pass_id as given. The previous "pass_id - 1"
    # decremented a second time, because the caller already passes
    # "pass_num - 1", so the last pass was never loaded.
    # The archive is closed as soon as the parameters have been read.
    # NOTE(review): assumes from_tar reads the whole archive before returning.
    with gzip.open("ranknet_params_%d.tar.gz" % pass_id) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # documents to score, kept in the same order as their query ids
    infer_query_id = []
    infer_data = []

    # mq2007 built-in plain_txt format yields
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    for query_id, _relevance_score, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        # paddle.infer expects each sample to be a sequence of fields, one
        # per data layer; the network has a single data layer.
        infer_data.append((feature_vector, ))

    # predict the score of every document; sorting the documents of a query
    # by this score in descending order gives the ranking
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for query_id, score in zip(infer_query_id, scores):
        # joined with single spaces to reproduce the output of the former
        # multi-item print statement exactly
        print(" ".join([
            "query_id : ", str(query_id), " ranknet rank document order : ",
            str(score)
        ]))
|  | 129 | + | 
|  | 130 | + | 
if __name__ == '__main__':
    # script entry point: train for a fixed number of passes, then run
    # inference with "number of passes - 1" as the pass id
    total_passes = 2
    paddle.init(use_gpu=False, trainer_count=4)
    train_ranknet(total_passes)
    ranknet_infer(pass_id=total_passes - 1)
0 commit comments