
Commit 14764e9

Author: yunfan
Commit message: update
Parent: 4bb5dc6

File tree: 7 files changed (+52 / -46 lines)


pretrain/README.md

Lines changed: 12 additions & 3 deletions
@@ -2,13 +2,22 @@
 
 The code for pre-training CPT is based on [Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
 
-For the **Setup**, **Data Processing** and **Training** of CPT, you can refer to the [README](README_megatron.md) of Megatron-LM. The package [jieba_fast](https://github.com/deepcs233/jieba_fast) is needed for Whole Word Masking pre-training.
+For the **Setup** and **Data Processing** of CPT, you can refer to the [README](README_megatron.md) of Megatron-LM. The package [jieba_fast](https://github.com/deepcs233/jieba_fast) is needed for Whole Word Masking pre-training.
 
-After processing the data, place the `.bin` and `.idx` files into `./dataset/` and the vocab files into `vocab/bert_zh_vocab/`. Then use the scripts `run_pretrain_bart.sh` and `run_pretrain_cpt.sh` to train Chinese BART and CPT, respectively.
+## Training
+First, prepare the files in the following folders:
+- `dataset/`: the `.bin` and `.idx` files preprocessed from raw text.
+- `vocab/`: the vocab files and the model config file.
+- `roberta_zh/`: the checkpoint of Chinese RoBERTa, since CPT initializes its encoder from this checkpoint.
+
+Then use the scripts `run_pretrain_bart.sh` and `run_pretrain_cpt.sh` to train Chinese BART and CPT, respectively.
+
+*NOTE: the training scripts are distributed examples for 8 GPUs. You may alter the number of GPUs and change the training steps to meet your needs.*
 
 ## Main Changes
 - Add `bart_model` and `cpt_model` under `megatron/model`, so that Megatron can train BART and CPT.
-- Add `_HfAutoTokenizer` in `megatron/tokenizer/tokenizer.py` so that Megatron can use tokenizers from Huggingface-Transformers.
+- Add `_HfBertTokenizer` in `megatron/tokenizer/tokenizer.py` so that Megatron can use tokenizers from Huggingface-Transformers.
 - Add `bart_dataset` and `cpt_dataset` under `megatron/data` to produce data for Whole Word Masking (WWM) and Denoising Auto-Encoder (DAE) pre-training.
 - Add `tools/convert_ckpt.py` to convert Megatron checkpoints to Huggingface-Transformers format.
 - Add `tools/preprocess_data.py` to preprocess and chunk large amounts of text data into the binary format used by Megatron.
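
A quick way to catch a missing piece of the folder layout described in this README before launching a run is a small pre-flight check. The sketch below assumes the folder names from the README; the exact file names inside `vocab/` (`vocab.txt`, `config.json`) and the preprocessing output names are assumptions and may differ in your setup.

```python
# Minimal pre-flight check for the folder layout described in the README above.
from pathlib import Path

def check_layout(root="."):
    """Return a list of problems with the expected pre-training layout."""
    root = Path(root)
    problems = []

    # .bin/.idx pairs produced by tools/preprocess_data.py
    dataset = root / "dataset"
    if not (list(dataset.glob("*.bin")) and list(dataset.glob("*.idx"))):
        problems.append("dataset/ should contain the .bin/.idx files from preprocessing")

    # vocab.txt / config.json are assumed names for the BERT-style vocab and
    # the model config that the pre-training code reads from this folder.
    for name in ("vocab.txt", "config.json"):
        if not (root / "vocab" / name).exists():
            problems.append(f"vocab/{name} is missing")

    # Chinese RoBERTa checkpoint used to initialize the CPT encoder.
    if not (root / "roberta_zh").is_dir():
        problems.append("roberta_zh/ checkpoint folder is missing")

    return problems

if __name__ == "__main__":
    for problem in check_layout():
        print("WARNING:", problem)
```

Run it from the `pretrain/` directory before invoking the training scripts.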

pretrain/megatron/model/bart_model.py

Lines changed: 1 addition & 2 deletions
@@ -25,8 +25,7 @@ def __init__(self):
 
         self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
 
-        # config = BartConfig.from_pretrained(args.vocab_file)  # vocab file path also contains config.json
-        config = BartConfig.from_pretrained('vocab/bart_zh_vocab')
+        config = BartConfig.from_pretrained(args.vocab_file)  # vocab file path also contains config.json
         # encoder_config = BertConfig.from_pretrained(model_path)
         tokenizer = get_tokenizer()
         config.vocab_size = tokenizer.vocab_size
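
For orientation, the pattern this hunk settles on (read the BART config from the same directory as the vocab, then override the vocabulary size from the tokenizer) looks roughly like the standalone sketch below. This is not the Megatron code itself; the `vocab/` path stands in for the `--vocab-file` value from the training scripts and is an assumption.

```python
# Standalone sketch of the config-loading pattern above: the vocab directory
# also holds config.json, so BartConfig can be read from it directly, and
# vocab_size is then overridden from the actual tokenizer.
from transformers import BartConfig, BertTokenizer

vocab_dir = "vocab/"  # corresponds to --vocab-file in the run_pretrain_*.sh scripts
config = BartConfig.from_pretrained(vocab_dir)        # reads vocab/config.json
tokenizer = BertTokenizer.from_pretrained(vocab_dir)  # reads vocab/vocab.txt
config.vocab_size = tokenizer.vocab_size              # keep config and tokenizer in sync
```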

pretrain/megatron/model/cpt_model.py

Lines changed: 4 additions & 9 deletions
@@ -14,24 +14,19 @@
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
 from .module import MegatronModule
-# from transformers import BartForConditionalGeneration as HFBartModel
-# from megatron.model.modeling_bart import BartForConditionalGeneration as HFBartModel
-from megatron.model.modeling_cpt import BartForConditionalGeneration as HFBartModel
+from megatron.model.modeling_cpt import CPTForConditionalGeneration as HFBartModel
 from transformers import BertConfig, BartConfig
 
-class BartModel(MegatronModule):
+class CPTModel(MegatronModule):
     def __init__(self):
         super().__init__()
         args = get_args()
 
         self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
 
         # config = BartConfig.from_pretrained(args.vocab_file)  # vocab file path also contains config.json
-        if args.num_layers > 12:
-            model_path = 'roberta-zh/large'
-        else:
-            model_path = 'roberta-zh/base'
-        config = BartConfig.from_pretrained('vocab/bart_zh_vocab')
+        model_path = 'roberta_zh'
+        config = BartConfig.from_pretrained(args.vocab_file)  # vocab file path also contains config.json
         # encoder_config = BertConfig.from_pretrained(model_path)
         tokenizer = get_tokenizer()
         config.vocab_size = tokenizer.vocab_size
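
The new `model_path = 'roberta_zh'` points at the checkpoint folder the README asks you to prepare. How `CPTModel` actually consumes it is not visible in this hunk (the `BertConfig.from_pretrained(model_path)` line stays commented out), but since common Chinese RoBERTa checkpoints use the BERT architecture, loading one with Huggingface would look roughly like this hedged sketch:

```python
# Hedged sketch: read the roberta_zh checkpoint with the BERT classes from
# transformers, which is how BERT-architecture Chinese RoBERTa weights are
# normally loaded. Whether CPTModel copies these weights into its encoder
# tensor-by-tensor or via from_pretrained() is not shown in this diff.
from transformers import BertConfig, BertModel

model_path = "roberta_zh"  # same folder name as in cpt_model.py above
encoder_config = BertConfig.from_pretrained(model_path)
encoder = BertModel.from_pretrained(model_path, config=encoder_config)
```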

pretrain/megatron/tokenizer/tokenizer.py

Lines changed: 4 additions & 4 deletions
@@ -20,7 +20,7 @@
 
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
-from transformers import AutoTokenizer
+from transformers import BertTokenizer
 
 def build_tokenizer(args):
     """Initialize tokenizer."""
@@ -42,7 +42,7 @@ def build_tokenizer(args):
         assert args.merge_file is not None
         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
     elif args.tokenizer_type == 'Huggingface':
-        tokenizer = _HfAutoTokenizer(args.vocab_file)
+        tokenizer = _HfBertTokenizer(args.vocab_file)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))
@@ -128,9 +128,9 @@ def mask(self):
                                   'tokenizer'.format(self.name))
 
 
-class _HfAutoTokenizer(AbstractTokenizer):
+class _HfBertTokenizer(AbstractTokenizer):
     def __init__(self, from_pretrained_path):
-        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained_path)
+        self.tokenizer = BertTokenizer.from_pretrained(from_pretrained_path)
         self.tokenizer_type = self.tokenizer.__class__.__name__
         self._inv_vocab = {i:t for t,i in self.tokenizer.get_vocab().items()}
         super().__init__('Huggingface Tokenizer {}'.format(self.tokenizer_type))
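
Pieced together, the wrapper this file adds adapts a Huggingface `BertTokenizer` to Megatron's `AbstractTokenizer` interface. Below is a condensed sketch of that adapter, limited to what is visible in this diff; the remaining abstract methods such as `tokenize`/`detokenize` and the special-token ids are elided.

```python
from transformers import BertTokenizer

class _HfBertTokenizerSketch:
    """Condensed view of the _HfBertTokenizer wrapper above; not the full class."""

    def __init__(self, from_pretrained_path):
        self.tokenizer = BertTokenizer.from_pretrained(from_pretrained_path)
        self.tokenizer_type = self.tokenizer.__class__.__name__
        # Megatron wants an id -> token map; Huggingface's get_vocab() is token -> id.
        self._inv_vocab = {i: t for t, i in self.tokenizer.get_vocab().items()}

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self._inv_vocab

    @property
    def vocab_size(self):
        return self.tokenizer.vocab_size
```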

pretrain/pretrain_cpt.py

Lines changed: 17 additions & 14 deletions
@@ -27,7 +27,7 @@
     get_tokenizer
 )
 from megatron.data.dataset_utils import build_train_valid_test_datasets
-from megatron.model.cpt_model import BartModel
+from megatron.model.cpt_model import CPTModel
 from megatron.training import pretrain
 from megatron.utils import average_losses_across_data_parallel_group
 
@@ -37,15 +37,16 @@ def model_provider(pre_process=True, post_process=True):
     assert pre_process and post_process, "BART doesn't yet support pipelining"
 
     print_rank_0('building BART model ...')
-    model = BartModel()
+    model = CPTModel()
     print_rank_0(model)
     return model
 
 
+SHOW_DATA = False
 def get_batch(data_iterator):
     """Build the batch."""
 
-    keys = ['source', 'target', 'prev_output_tokens', 'pos1', 'pos2', 'attn_mask', 'loss_mask']
+    keys = ['source', 'target', 'prev_output_tokens', 'attn_mask', 'loss_mask', 'use_decoder']
     datatype = torch.int64
 
     # Broadcast data.
@@ -59,17 +60,19 @@ def get_batch(data_iterator):
     source = data_b['source'].long()
     target = data_b['target'].long()
     prev_output_tokens = data_b['prev_output_tokens'].long()
-    pos1 = data_b['pos1'].long()
-    pos2 = data_b['pos2'].long()
     attn_mask = data_b['attn_mask'].long()
     loss_mask = data_b['loss_mask'].float()
-    # print('source', source[0])
-    # print('target', target[0])
-    # tokenizer = get_tokenizer()
-    # print('source', tokenizer.detokenize(source[0]))
-    # print('target', tokenizer.detokenize(target[0]))
-    # import pdb; pdb.set_trace()
-    return source, target, prev_output_tokens, pos1, pos2, attn_mask, loss_mask
+    use_decoder = data_b['use_decoder'].long()
+
+    global SHOW_DATA
+    if not SHOW_DATA:
+        SHOW_DATA = True
+        print_rank_0('source: {}'.format(source[0]))
+        print_rank_0('target: {}'.format(target[0]))
+        tokenizer = get_tokenizer()
+        print_rank_0('source: {}'.format(tokenizer.detokenize(source[0])))
+        print_rank_0('target: {}'.format(tokenizer.detokenize(target[0])))
+    return source, target, prev_output_tokens, attn_mask, loss_mask, use_decoder
 
 
 def loss_func(loss_mask, output_tensor):
@@ -94,11 +97,11 @@ def forward_step(data_iterator, model):
 
     # Get the batch.
     timers('batch-generator').start()
-    source, target, prev_output_tokens, pos1, pos2, attn_mask, loss_mask = get_batch(data_iterator)
+    source, target, prev_output_tokens, attn_mask, loss_mask, use_decoder = get_batch(data_iterator)
     timers('batch-generator').stop()
 
     # Forward model lm_labels
-    output_tensor = model(source, attn_mask, prev_output_tokens, pos1, pos2, target)
+    output_tensor = model(source, attn_mask, prev_output_tokens, target, use_decoder)
 
     return output_tensor, partial(loss_func, loss_mask)
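
Two things change in `get_batch()`: the broadcast keys drop `pos1`/`pos2` in favor of `use_decoder`, and the old commented-out debug prints become a one-shot inspection of the first batch. In isolation, that one-shot pattern looks like the sketch below, with `print_rank_0`, `get_tokenizer`, and the `data_b` dict passed in to stand for the Megatron objects used above.

```python
SHOW_DATA = False  # module-level flag so the batch is printed exactly once

def show_first_batch(data_b, print_rank_0, get_tokenizer):
    """Print the first batch as token ids and as detokenized text, then go quiet."""
    global SHOW_DATA
    if SHOW_DATA:
        return
    SHOW_DATA = True
    source, target = data_b["source"], data_b["target"]
    print_rank_0("source ids:  {}".format(source[0]))
    print_rank_0("target ids:  {}".format(target[0]))
    tokenizer = get_tokenizer()
    print_rank_0("source text: {}".format(tokenizer.detokenize(source[0])))
    print_rank_0("target text: {}".format(tokenizer.detokenize(target[0])))
```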

pretrain/run_pretrain_bart.sh

Lines changed: 3 additions & 3 deletions
@@ -10,7 +10,7 @@ WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
 DATA_PATH="dataset/"
 CHECKPOINT_PATH=checkpoints/bart-base
-VOCAB_FILE=vocab/bert_zh_vocab/
+VOCAB_FILE=vocab/
 
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 
@@ -39,11 +39,11 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --initial-loss-scale 65536 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --lr-warmup-fraction .032 \
+       --lr-warmup-fraction .01 \
        --log-interval 1 \
        --save-interval 1600 \
        --eval-interval 500 \
-       --eval-iters 1 \
+       --eval-iters 10 \
        --fp16 \
        --optimizer adam \
        --num-workers 2 \

pretrain/run_pretrain_cpt.sh

Lines changed: 11 additions & 11 deletions
@@ -10,7 +10,7 @@ WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
 DATA_PATH="dataset/"
 CHECKPOINT_PATH=checkpoints/cpt-base
-VOCAB_FILE=vocab/bert_zh_vocab/
+VOCAB_FILE=vocab/
 
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 
@@ -20,14 +20,13 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --num-decoder-layers 2 \
        --hidden-size 768 \
        --num-attention-heads 12 \
-       --micro-batch-size 32 \
-       --global-batch-size 512 \
+       --micro-batch-size 16 \
+       --global-batch-size 256 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --mask-prob 0.15 \
-       --train-iters 1000000 \
-       --lr-decay-iters 1000000 \
-       --lr-warmup-fraction .01 \
+       --train-iters 100000 \
+       --lr-decay-iters 100000 \
        --save $CHECKPOINT_PATH \
        --load $CHECKPOINT_PATH \
        --data-path $DATA_PATH \
@@ -36,16 +35,17 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --split 949,30,1 \
        --distributed-backend nccl \
        --lr 1e-4 \
-       --lr-encoder 5e-5 \
        --lr-decay-style cosine \
        --min-lr 1e-6 \
+       --initial-loss-scale 65536 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --initial-loss-scale 65536 \
-       --log-interval 10 \
-       --save-interval 10000 \
+       --lr-warmup-fraction .01 \
+       --log-interval 1 \
+       --save-interval 1600 \
        --eval-interval 500 \
        --eval-iters 10 \
-       --num-workers 2 \
        --fp16 \
+       --optimizer adam \
+       --num-workers 2 \
        # --checkpoint-activations
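
The batch-size and schedule changes above work out as follows, assuming the single-node 8-GPU launch from the README (data-parallel size 8, no model parallelism); that launch setup is an assumption rather than something fixed by the script itself.

```python
# Worked numbers for the updated run_pretrain_cpt.sh flags; the data-parallel
# size of 8 is taken from the README's 8-GPU example and may differ for you.
micro_batch_size   = 16
global_batch_size  = 256
data_parallel_size = 8          # GPUS_PER_NODE * NNODES in the script

grad_accum_steps = global_batch_size // (micro_batch_size * data_parallel_size)
print(grad_accum_steps)         # -> 2 accumulation steps per optimizer update

train_iters        = 100_000    # also used as --lr-decay-iters
lr_warmup_fraction = 0.01       # fraction of the decay horizon spent warming up
print(int(train_iters * lr_warmup_fraction))  # -> 1000 warmup iterations
```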
