PaddlePaddle · lcy-seso · Jul 19, 2017 · Jul 19, 2017 · Jul 19, 2017 · Jul 19, 2017
diff --git a/dssm/README.md b/dssm/README.md
@@ -384,11 +384,13 @@ def _build_rank_model(self):
 ```
 usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
  [-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
- [-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE --model_arch
- MODEL_ARCH
+ [-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
  [--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
  [--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
  [--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
+ [--model_output_prefix MODEL_OUTPUT_PREFIX]
+ [-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
+ [-z NUM_BATCHES_TO_SAVE_MODEL]
 
 PaddlePaddle DSSM example
 
@@ -408,9 +410,9 @@ optional arguments:
  -p NUM_PASSES, --num_passes NUM_PASSES
  number of passes to run(default:10)
  -y MODEL_TYPE, --model_type MODEL_TYPE
- model type, 0 for classification, 1 for pairwise rank
- (default: classification)
- --model_arch MODEL_ARCH
+ model type, 0 for classification, 1 for pairwise rank,
+ 2 for regression (default: classification)
+ -a MODEL_ARCH, --model_arch MODEL_ARCH
  model architecture, 1 for CNN, 0 for FC, 2 for RNN
  --share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
  whether to share network parameters between source and
@@ -426,8 +428,73 @@ optional arguments:
  --use_gpu USE_GPU whether to use GPU devices (default: False)
  -c CLASS_NUM, --class_num CLASS_NUM
  number of categories for classification task.
+ --model_output_prefix MODEL_OUTPUT_PREFIX
+ prefix of the path for model to store, (default: ./)
+ -g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
+ number of batches to output train log, (default: 100)
+ -e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
+ number of batches to test, (default: 200)
+ -z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
+ number of batches to output model, (default: 400)
 ```
 
+重要的参数描述如下
+
+- `train_data_path` 训练数据路径
+- `test_data_path` 测试数据路径，可以不设置
+- `source_dic_path` 源字典路径
+- `target_dic_path` 目标字典路径
+- `model_type` 模型的损失函数的类型，分类0，排序1，回归2
+- `model_arch` 模型结构，FC 0， CNN 1, RNN 2
+- `dnn_dims` 模型各层的维度设置，默认为 `256,128,64,32`，即模型有4层，各层维度如上设置
+
+## 用训练好的模型预测
+```
+usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
+ PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
+ [--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
+ [--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
+ [--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
+ [-c CLASS_NUM]
+
+PaddlePaddle DSSM infer
+
+optional arguments:
+ -h, --help show this help message and exit
+ --model_path MODEL_PATH
+ path of model parameters file
+ -i DATA_PATH, --data_path DATA_PATH
+ path of the dataset to infer
+ -o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
+ path to output the prediction
+ -y MODEL_TYPE, --model_type MODEL_TYPE
+ model type, 0 for classification, 1 for pairwise rank,
+ 2 for regression (default: classification)
+ -s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
+ path of the source's word dic
+ --target_dic_path TARGET_DIC_PATH
+ path of the target's word dic, if not set, the
+ `source_dic_path` will be used
+ -a MODEL_ARCH, --model_arch MODEL_ARCH
+ model architecture, 1 for CNN, 0 for FC, 2 for RNN
+ --share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
+ whether to share network parameters between source and
+ target
+ --share_embed SHARE_EMBED
+ whether to share word embedding between source and
+ target
+ --dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
+ which means create a 4-layer dnn, demention of each
+ layer is 256, 128, 64 and 32
+ -c CLASS_NUM, --class_num CLASS_NUM
+ number of categories for classification task.
+```
+
+部分参数可以参考 `train.py`，重要参数解释如下
+
+- `data_path` 需要预测的数据路径
+- `prediction_output_path` 预测的输出路径
+
 ## 参考文献
 
 1. Huang P S, He X, Gao J, et al. Learning deep structured semantic models for web search using clickthrough data[C]//Proceedings of the 22nd ACM international conference on Conference on information & knowledge management. ACM, 2013: 2333-2338.

diff --git a/dssm/infer.py b/dssm/infer.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+import itertools
+
+import reader
+import paddle.v2 as paddle
+from network_conf import DSSM
+from utils import logger, ModelType, ModelArch, load_dic
+
+def _str2bool(value):
+    '''Parse a boolean option; `type=bool` maps any non-empty string to True.'''
+    return value.lower() in ('true', 't', 'yes', 'y', 'on', '1')
+
+
+parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
+parser.add_argument(
+    '--model_path',
+    type=str,
+    required=True,
+    help="path of model parameters file")
+parser.add_argument(
+    '-i', '--data_path',
+    type=str,
+    required=True,
+    help="path of the dataset to infer")
+parser.add_argument(
+    '-o', '--prediction_output_path',
+    type=str,
+    required=True,
+    help="path to output the prediction")
+# NOTE: -y and -a are required, so their `default` is never applied.
+parser.add_argument(
+    '-y', '--model_type',
+    type=int,
+    required=True,
+    default=ModelType.CLASSIFICATION_MODE,
+    help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)"
+    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
+       ModelType.REGRESSION_MODE))
+parser.add_argument(
+    '-s', '--source_dic_path',
+    type=str,
+    help="path of the source's word dic")
+parser.add_argument(
+    '--target_dic_path',
+    type=str,
+    help="path of the target's word dic, if not set, the `source_dic_path` will be used")
+parser.add_argument(
+    '-a', '--model_arch',
+    type=int,
+    required=True,
+    default=ModelArch.CNN_MODE,
+    help="model architecture, %d for CNN, %d for FC, %d for RNN" %
+    (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
+parser.add_argument(
+    '--share_network_between_source_target',
+    type=_str2bool,
+    default=False,
+    help="whether to share network parameters between source and target")
+parser.add_argument(
+    '--share_embed',
+    type=_str2bool,
+    default=False,
+    help="whether to share word embedding between source and target")
+parser.add_argument(
+    '--dnn_dims',
+    type=str,
+    default='256,128,64,32',
+    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32")
+parser.add_argument(
+    '-c', '--class_num',
+    type=int,
+    default=0,
+    help="number of categories for classification task.")
+
+args = parser.parse_args()
+args.model_type = ModelType(args.model_type)
+args.model_arch = ModelArch(args.model_arch)
+if args.model_type.is_classification() and args.class_num <= 1:
+    # parser.error (unlike assert) still fires under `python -O`.
+    parser.error("--class_num should be set in classification task.")
+
+# A list (not a lazy map object) so the dims can be read more than once.
+layer_dims = [int(dim) for dim in args.dnn_dims.split(',')]
+# Fall back to the source dictionary when no target dictionary is given.
+args.target_dic_path = args.target_dic_path or args.source_dic_path
+
+# Inference runs on CPU with a single trainer.
+paddle.init(use_gpu=False, trainer_count=1)
+
+
+class Inferer(object):
+    '''Builds the DSSM network from the module-level `args` and runs inference.'''
+
+    def __init__(self, param_path):
+        '''Create the network and load its parameters from `param_path`.'''
+        logger.info("create DSSM model")
+        dic_paths = (args.source_dic_path, args.target_dic_path)
+        vocab_sizes = [len(load_dic(path)) for path in dic_paths]
+        cost, prediction, label = DSSM(
+            dnn_dims=layer_dims,
+            vocab_sizes=vocab_sizes,
+            model_type=args.model_type, model_arch=args.model_arch,
+            share_semantic_generator=args.share_network_between_source_target,
+            class_num=args.class_num,
+            share_embed=args.share_embed)()
+
+        logger.info("load model parameters from %s" % param_path)
+        # from_tar reads a tar archive: open it in binary mode and close the
+        # handle once loading is done instead of leaking it.
+        with open(param_path, 'rb') as param_file:
+            self.parameters = paddle.parameters.Parameters.from_tar(param_file)
+        self.inferer = paddle.inference.Inference(
+            output_layer=prediction, parameters=self.parameters)
+
+    def infer(self, data_path):
+        '''Predict each record of `data_path`; write one output line per record.'''
+        logger.info("infer data...")
+        dataset = reader.Dataset(
+            train_path=data_path, test_path=None,
+            source_dic_path=args.source_dic_path,
+            target_dic_path=args.target_dic_path,
+            model_type=args.model_type, )
+        infer_reader = paddle.batch(dataset.infer, batch_size=1000)
+        logger.warning('write predictions to %s' % args.prediction_output_path)
+        # `with` closes (and therefore flushes) the output even on error.
+        with open(args.prediction_output_path, 'w') as output_f:
+            for batch in infer_reader():
+                res = self.inferer.infer(input=batch)
+                predictions = [' '.join(map(str, x)) for x in res]
+                assert len(batch) == len(predictions), (
+                    "predict error, %d inputs, but %d predictions" %
+                    (len(batch), len(predictions)))
+                output_f.write('\n'.join(predictions) + '\n')
+
+
+if __name__ == '__main__':
+    # Build the model from the saved parameters, then predict the dataset.
+    Inferer(args.model_path).infer(args.data_path)
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
@@ -11,7 +11,8 @@ def __init__(self,
  model_arch=ModelArch.create_cnn(),
  share_semantic_generator=False,
  class_num=None,
- share_embed=False):
+ share_embed=False,
+ is_infer=False):
  '''
  @dnn_dims: list of int
  dimentions of each layer in semantic vector generator.
@@ -40,6 +41,7 @@ def __init__(self,
  self.model_type = ModelType(model_type)
  self.model_arch = ModelArch(model_arch)
  self.class_num = class_num
+ self.is_infer = is_infer
  logger.warning("build DSSM model with config of %s, %s" %
  (self.model_type, self.model_arch))
  logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
@@ -68,9 +70,6 @@ def _model_arch_creater(emb, prefix=''):
  self.model_type_creater = _model_type[str(self.model_type)]
 
  def __call__(self):
- # if self.model_type.is_classification():
- # return self._build_classification_model()
- # return self._build_rank_model()
  return self.model_type_creater()
 
  def create_embedding(self, input, prefix=''):
@@ -189,8 +188,9 @@ def _build_rank_model(self):
  right_target = paddle.layer.data(
  name='right_target_input',
  type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
- label = paddle.layer.data(
- name='label_input', type=paddle.data_type.integer_value(1))
+ if not self.is_infer:
+ label = paddle.layer.data(
+ name='label_input', type=paddle.data_type.integer_value(1))
 
  prefixs = '_ _ _'.split(
  ) if self.share_semantic_generator else 'source left right'.split()
@@ -212,12 +212,14 @@ def _build_rank_model(self):
  # cossim score of source and right target
  right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
 
- # rank cost
- cost = paddle.layer.rank_cost(left_score, right_score, label=label)
- # prediction = left_score - right_score
- # but this operator is not supported currently.
- # so AUC will not used.
- return cost, None, None
+ if not self.is_infer:
+ # rank cost
+ cost = paddle.layer.rank_cost(left_score, right_score, label=label)
+ # prediction = left_score - right_score
+ # but this operator is not supported currently.
+ # so AUC will not used.
+ return cost, None, label
+ return None, [left_score, right_score], label
 
  def _build_classification_or_regression_model(self, is_classification):
  '''
@@ -270,38 +272,7 @@ def _build_classification_or_regression_model(self, is_classification):
  else:
  prediction = paddle.layer.cos_sim(*semantics)
  cost = paddle.layer.mse_cost(prediction, label)
- return cost, prediction, label
-
-
-class RankMetrics(object):
- '''
- A custom metrics to calculate AUC.
 
- Paddle's rank model do not support auc evaluator directly,
- to make it, infer all the outputs and use python to calculate
- the metrics.
- '''
-
- def __init__(self, model_parameters, left_score_layer, right_score_layer,
- label):
- '''
- @model_parameters: dict
- model's parameters
- @left_score_layer: paddle.layer
- left part's score
- @right_score_laeyr: paddle.layer
- right part's score
- @label: paddle.data_layer
- label input
- '''
- self.inferer = paddle.inference.Inference(
- output_layer=[left_score_layer, right_score_layer],
- parameters=model_parameters)
-
- def test(self, input):
- scores = []
- for id, rcd in enumerate(input()):
- # output [left_score, right_score, label]
- res = self.inferer(input=input)
- scores.append(res)
- print scores
+ if not self.is_infer:
+ return cost, prediction, label
+ return None, prediction, label