-# Represent the dict sizes of source and target language. The dict from the
-# dataset here used includes the <bos>, <eos> and <unk> token but exlcudes
-# the <pad> token. It should plus 1 to include the padding token when used as
-# the size of lookup table.
-src_vocab_size = 10000
-trg_vocab_size = 10000
-# Represent the id of <pad> token in source language.
-src_pad_idx = src_vocab_size
-# Represent the id of <pad> token in target language.
-trg_pad_idx = trg_vocab_size
-# Represent the position value corresponding to the <pad> token.
-pos_pad_idx = 0
-# Represent the max length of sequences. It should plus 1 to include position
-# padding token for position encoding.
-max_length = 50
-# Represent the epoch number to train.
-pass_num = 2
-# Represent the number of sequences contained in a mini-batch.
-batch_size = 64
-# Reprent the params for Adam optimizer.
-learning_rate = 0.001
-beta1 = 0.9
-beta2 = 0.98
-eps = 1e-9
-# Represent the dimension of embeddings, which is also the last dimension of
-# the input and output of multi-head attention, position-wise feed-forward
-# networks, encoder and decoder.
-d_model = 512
-# Represent the size of the hidden layer in position-wise feed-forward networks.
-d_inner_hid = 1024
-# Represent the dimension keys are projected to for dot-product attention.
-d_key = 64
-# Represent the dimension values are projected to for dot-product attention.
-d_value = 64
-# Represent the number of head used in multi-head attention.
-n_head = 8
-# Represent the number of sub-layers to be stacked in the encoder and decoder.
-n_layer = 6
-# Represent the dropout rate used by all dropout layers.
-dropout = 0.1
-
-# Names of position encoding table which will be initialized in external.
-pos_enc_param_names = ("src_pos_enc_table", "trg_pos_enc_table")
+class TrainTaskConfig(object):
+    use_gpu = False
+    # number of epochs to train.
+    pass_num = 2
+
+    # number of sequences contained in a mini-batch.
+    batch_size = 64
+
+    # hyper-parameters for the Adam optimizer.
+    learning_rate = 0.001
+    beta1 = 0.9
+    beta2 = 0.98
+    eps = 1e-9
+
+
+class ModelHyperParams(object):
+    # Dictionary sizes for the source and target languages. This model directly
+    # uses paddle.dataset.wmt16, in which the <bos>, <eos> and <unk> tokens have
+    # already been added, but the <pad> token has not. The Transformer requires
+    # the sequences in a mini-batch to be padded to the same length, so a <pad>
+    # token is appended to the original paddle.dataset.wmt16 dictionary.
+
+    # size of the source word dictionary.
+    src_vocab_size = 10000
+    # index of the <pad> token in the source language.
+    src_pad_idx = src_vocab_size
+
+    # size of the target word dictionary.
+    trg_vocab_size = 10000
+    # index of the <pad> token in the target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. Add 1 to it to account for the position padding
+    # token used by the position encoding.
+    max_length = 50
+
+    # dimension of word embeddings, which is also the last dimension of the
+    # input and output of multi-head attention, the position-wise feed-forward
+    # networks, and the encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in the position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of heads used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+# Names of the position encoding tables, which will be initialized externally.
+pos_enc_param_names = (
+    "src_pos_enc_table",
+    "trg_pos_enc_table", )
+
 # Names of all data layers listed in order.
-input_data_names = ("src_word", "src_pos", "trg_word", "trg_pos",
-                    "src_slf_attn_bias", "trg_slf_attn_bias",
-                    "trg_src_attn_bias", "lbl_word")
+input_data_names = (
+    "src_word",
+    "src_pos",
+    "trg_word",
+    "trg_pos",
+    "src_slf_attn_bias",
+    "trg_slf_attn_bias",
+    "trg_src_attn_bias",
+    "lbl_word", )
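The Adam settings in `TrainTaskConfig` use the Transformer paper's non-default choices (`beta2 = 0.98`, `eps = 1e-9`). As a purely illustrative sketch, not part of this diff or the model code, the following shows where each hyper-parameter enters a single Adam update step:

```python
import numpy as np

# Illustrative only: one Adam step, showing where learning_rate, beta1,
# beta2 and eps from TrainTaskConfig enter the parameter update.
learning_rate, beta1, beta2, eps = 0.001, 0.9, 0.98, 1e-9

param = np.zeros(4)        # toy parameter vector
m = np.zeros_like(param)   # first-moment (mean of gradients) estimate
v = np.zeros_like(param)   # second-moment (uncentered variance) estimate

grad = np.array([0.1, -0.2, 0.3, -0.4])  # toy gradient for this step
t = 1                                    # time step, used for bias correction
m = beta1 * m + (1 - beta1) * grad
v = beta2 * v + (1 - beta2) * grad ** 2
m_hat = m / (1 - beta1 ** t)
v_hat = v / (1 - beta2 ** t)
param -= learning_rate * m_hat / (np.sqrt(v_hat) + eps)
```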
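Because the `paddle.dataset.wmt16` dictionary has no `<pad>` entry, the config gives `<pad>` the next free id (`vocab_size`), so an embedding lookup table built from it needs `vocab_size + 1` rows. A minimal NumPy sketch of that bookkeeping follows; the variable names are illustrative, not taken from the model code:

```python
import numpy as np

# Hypothetical illustration of the <pad> bookkeeping described in the config.
src_vocab_size = 10000
src_pad_idx = src_vocab_size      # <pad> takes the next free id
d_model = 512

# The lookup table gets vocab_size + 1 rows; the <pad> row is zeroed so that
# padded positions contribute nothing to the word embeddings.
src_word_emb = np.random.randn(src_vocab_size + 1, d_model).astype("float32")
src_word_emb[src_pad_idx, :] = 0.0

# Sequences in a mini-batch are padded to the same length with src_pad_idx.
batch = [[5, 42, 7], [13, 99]]
max_len = max(len(seq) for seq in batch)
padded = np.array([seq + [src_pad_idx] * (max_len - len(seq)) for seq in batch])
emb = src_word_emb[padded]        # shape: [batch_size, max_len, d_model]
```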
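The values of `d_model`, `n_head`, `d_key` and `d_value` are tied together: each of the 8 heads attends over 64-dimensional keys and values, and the concatenated heads are projected back to `d_model = 512`. The shape sketch below, with random weights, only shows these relationships and is not the model's actual attention implementation (the `*_attn_bias` masks, dropout and layer normalization are omitted):

```python
import numpy as np

# Shape sketch only: how d_model, n_head, d_key and d_value fit together
# (512 = 8 heads x 64 per-head key/value dimensions in this config).
d_model, n_head, d_key, d_value = 512, 8, 64, 64
batch_size, seq_len = 2, 5

x = np.random.randn(batch_size, seq_len, d_model).astype("float32")
# Queries/keys are projected to n_head * d_key, values to n_head * d_value.
scale = 1.0 / np.sqrt(d_model)
w_q = np.random.randn(d_model, n_head * d_key).astype("float32") * scale
w_k = np.random.randn(d_model, n_head * d_key).astype("float32") * scale
w_v = np.random.randn(d_model, n_head * d_value).astype("float32") * scale
w_o = np.random.randn(n_head * d_value, d_model).astype("float32") * scale

# Split the projections into heads: [batch, n_head, seq_len, d_key/d_value].
q = (x @ w_q).reshape(batch_size, seq_len, n_head, d_key).transpose(0, 2, 1, 3)
k = (x @ w_k).reshape(batch_size, seq_len, n_head, d_key).transpose(0, 2, 1, 3)
v = (x @ w_v).reshape(batch_size, seq_len, n_head, d_value).transpose(0, 2, 1, 3)

# Scaled dot-product attention per head (attention bias/masking omitted here).
scores = q @ k.transpose(0, 1, 3, 2) / np.sqrt(d_key)
scores -= scores.max(axis=-1, keepdims=True)           # numerically stable softmax
weights = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)
heads = weights @ v                                     # [batch, n_head, seq, d_value]

# Concatenate the heads and project back to d_model.
concat = heads.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, n_head * d_value)
out = concat @ w_o                                      # [batch, seq_len, d_model]
```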
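`pos_enc_param_names` names position encoding tables that are filled in outside the network definition, and `max_length` is increased by 1 so that position id 0 (`pos_pad_idx`) can be reserved for padded positions. Below is a sketch of how such a table of shape `[max_length + 1, d_model]` could be built with the sinusoidal formula from "Attention Is All You Need"; the helper name is an assumption, not necessarily the repo's actual function:

```python
import numpy as np

def position_encoding_init(n_position, d_model):
    """Sinusoidal position encoding:
    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
    """
    position = np.arange(n_position, dtype="float64")[:, None]
    div_term = np.power(10000.0, 2.0 * (np.arange(d_model) // 2) / d_model)
    table = position / div_term
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    return table.astype("float32")

# max_length + 1 rows so that position id 0 (pos_pad_idx) can be reserved for
# padding; its row is zeroed, and real positions use ids 1..max_length.
max_length, d_model = 50, 512
pos_enc_table = position_encoding_init(max_length + 1, d_model)
pos_enc_table[0, :] = 0.0
# Such an array would be used to initialize the parameters named in
# pos_enc_param_names ("src_pos_enc_table", "trg_pos_enc_table").
```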