Quick Start: Python-Only Version (Colab)

The Quick Start tutorial contains many shell commands. For those who are not very familiar with the terminal, or for Windows users, a Python-only version may be very useful.

Feel free to comment.

Here is the Python code. On Windows, pip installs OpenNMT-py 1.2.0 by default, so the exact latest version number is pinned below. If you get a torch version error, run the last line to install torch 1.6.0; otherwise it is not needed.

Creating a virtual environment first is recommended.
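
If you want to stay entirely in Python, the standard library's `venv` module can create one. A minimal sketch; the environment name `onmt-env` is just an example:

```python
# Create a virtual environment named "onmt-env" with pip available inside it.
import venv

venv.create("onmt-env", with_pip=True)
```

Activate it from your terminal before installing (`onmt-env\Scripts\activate` on Windows, `source onmt-env/bin/activate` on Linux/macOS).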

Install the libraries:

```
pip install --upgrade pip
pip install OpenNMT-py==2.1.0
# may not be needed:
pip install --ignore-installed torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
```
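
To verify the installation before running anything, here is a quick sanity check from Python (assuming `onmt.__version__` is defined in your OpenNMT-py build; `pip show OpenNMT-py` works as a fallback):

```python
import torch
import onmt

print(onmt.__version__)           # expected: 2.1.0
print(torch.__version__)          # expected: 1.6.0+cu101 (or your CPU build)
print(torch.cuda.is_available())  # True if training can use a GPU
```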

Save the following Python code as onmt-quickstart.py and execute it, e.g. `python onmt-quickstart.py` (or `py onmt-quickstart.py` on Windows):

```python
import requests
import tarfile
import sys
import yaml

from onmt.utils.parse import ArgumentParser
from onmt.opts import dynamic_prepare_opts, train_opts, config_opts, translate_opts
from onmt.bin.build_vocab import build_vocab_main
from onmt.bin.train import train
from onmt.bin.translate import translate


def download_file_with_progress_bar(file_name, link):
    with open(file_name, "wb") as f:
        print("Downloading %s" % file_name)
        response = requests.get(link, stream=True)
        total_length = response.headers.get('content-length')
        if total_length is None:  # no content-length header
            f.write(response.content)
        else:
            dl = 0
            total_length = int(total_length)
            for data in response.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                sys.stdout.write(" " + str(round(total_length / 1024 / 1024, 2)) + "M")
                sys.stdout.flush()
            sys.stdout.write("\n")


def extract_tar_file(file_name):
    tar = tarfile.open(file_name, "r:gz")
    tar.extractall()
    tar.close()


def print_first_x_lines_of_a_file(file_name, number_of_lines):
    with open(file_name) as a_file:
        for _ in range(number_of_lines):
            print(a_file.readline())


def create_yaml_for_vocabulary(file_name):
    yaml_config = """
## Where the samples will be written
save_data: toy-ende/run/example
## Where the vocab(s) will be written
src_vocab: toy-ende/run/example.vocab.src
tgt_vocab: toy-ende/run/example.vocab.tgt
overwrite: true

# Corpus opts:
data:
    corpus:
        path_src: toy-ende/src-train.txt
        path_tgt: toy-ende/tgt-train.txt
        transforms: []
        weight: 1
    valid:
        path_src: toy-ende/src-val.txt
        path_tgt: toy-ende/tgt-val.txt
        transforms: []
"""
    yaml.safe_load(yaml_config)  # parse once to catch YAML mistakes early
    with open(file_name, "w") as f:
        f.write(yaml_config)


def create_yaml_for_train(file_name, gpu_array):
    yaml_config = """
## Where the samples will be written
save_data: toy-ende/run/example
## Vocabulary files created by the build-vocab step
src_vocab: toy-ende/run/example.vocab.src
tgt_vocab: toy-ende/run/example.vocab.tgt
overwrite: true

# Corpus opts:
data:
    corpus:
        path_src: toy-ende/src-train.txt
        path_tgt: toy-ende/tgt-train.txt
        transforms: []
        weight: 1
    valid:
        path_src: toy-ende/src-val.txt
        path_tgt: toy-ende/tgt-val.txt
        transforms: []

# Train on a single GPU; [] means CPU-only
world_size: 1
gpu_ranks: {}

# Where to save the checkpoints
save_model: toy-ende/run/model
save_checkpoint_steps: 500
train_steps: 1000
valid_steps: 500
""".format(gpu_array)
    yaml.safe_load(yaml_config)  # parse once to catch YAML mistakes early
    with open(file_name, "w") as f:
        f.write(yaml_config)


if __name__ == "__main__":
    # === Download the toy English-German data set and extract it ===
    # It contains train, test and validation files for the source and
    # target languages: src-train/tgt-train, src-test/tgt-test, src-val/tgt-val
    download_file_with_progress_bar(
        "toy-ende.tar.gz",
        "https://s3.amazonaws.com/opennmt-trainingdata/toy-ende.tar.gz"
    )
    extract_tar_file("toy-ende.tar.gz")
    print_first_x_lines_of_a_file("toy-ende/src-train.txt", 3)

    # === Prepare the build-vocab YAML file and build example.vocab.[src/tgt] ===
    create_yaml_for_vocabulary("toy-ende/build-vocab.yaml")
    parser = ArgumentParser(description='build_vocab.py')
    dynamic_prepare_opts(parser, build_vocab_only=True)
    base_args = ["-config", "toy-ende/build-vocab.yaml", "-n_sample", "10000"]
    opts, unknown = parser.parse_known_args(base_args)
    build_vocab_main(opts)

    # === Prepare the train YAML file ===
    # For GPU support, pass the GPU ranks as the second parameter,
    # e.g. "[0]" or "[0,1]"; "[]" means CPU-only training.
    create_yaml_for_train("toy-ende/train.yaml", "[0]")

    # === Start training ===
    parser = ArgumentParser(description='train.py')
    train_opts(parser)
    base_args = ["-config", "toy-ende/train.yaml"]
    opts, unknown = parser.parse_known_args(base_args)
    train(opts)

    # === Translate the test set into toy-ende/pred_1000.txt ===
    parser = ArgumentParser(description='translate.py')
    config_opts(parser)
    translate_opts(parser)
    base_args = [
        "-model", "toy-ende/run/model_step_1000.pt",
        "-src", "toy-ende/src-test.txt",
        "-output", "toy-ende/pred_1000.txt",
        "-gpu", "0",  # remove "-gpu" and "0" for CPU-only translation
        "-verbose",
    ]
    opts, unknown = parser.parse_known_args(base_args)
    translate(opts)
```
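
Once training and translation finish, you can eyeball the output. A small sketch using only the standard library; the file names match what the script above writes:

```python
# Show the first three model predictions next to the reference translations.
from itertools import islice

with open("toy-ende/pred_1000.txt", encoding="utf-8") as pred, \
        open("toy-ende/tgt-test.txt", encoding="utf-8") as ref:
    for hyp, gold in islice(zip(pred, ref), 3):
        print("PRED:", hyp.strip())
        print("GOLD:", gold.strip())
        print()
```

Do not expect good translations: 1000 steps on this toy corpus is only enough to check that the pipeline runs end to end.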