# -*- coding: utf-8 -*-
'''
Word2word
authors: Kyubyong Park (kbpark.linguist@gmail.com), YJ Choe (yjchoe33@gmail.com), Dongwoo Kim (kimdwkimdw@gmail.com)

'''
import argparse
import codecs
import logging
import operator
import os
import pickle
import re
from collections import Counter
from itertools import chain

from tqdm import tqdm

from utils import get_savedir


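# `download` shells out to wget, unzip, and rm via os.system, so those tools must be
# available on PATH. The extracted files are later read as
# data/OpenSubtitles.{lang1}-{lang2}.{lang1} and data/OpenSubtitles.{lang1}-{lang2}.{lang2} (see `main`).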
def download(lang1, lang2):
    '''Download a parallel corpus from OpenSubtitles 2018.'''
    download_cmd = f"wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/{lang1}-{lang2}.txt.zip -P data"
    unzip = "unzip data/*.zip -d data/"
    rm_zip = "rm data/*.zip"
    rm_ids = "rm data/*.ids"
    rm_readme = "rm README*"
    for cmd in (download_cmd, unzip, rm_zip, rm_ids, rm_readme):
        os.system(cmd)

def normalize(tokens, ignore_first_word):
    '''If ignore_first_word is True,
    drop the first token because its true case is unclear.'''
    if ignore_first_word:
        tokens = tokens[1:]
    return tokens

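# `word_segment` expects the language-specific tokenizers built by `load_tokenizer` below:
# nltk's word_tokenize (en), KoNLPy's Kkma (ko), KyTea (ja, zh_cn), jieba (zh_tw),
# pythainlp (th), pyvi (vi), and pyarabic (ar). Other languages fall back to whitespace
# splitting after separating sentence-final punctuation.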
def word_segment(sent, lang, tokenizer):
    if lang == "en":
        words = tokenizer(sent)
    elif lang == "ko":
        words = [word for word, _ in tokenizer.pos(sent)]
    elif lang == "ja":
        words = list(tokenizer.getWS(sent))
    elif lang == "th":
        words = tokenizer(sent, engine="mm")
    elif lang == "vi":
        words = tokenizer.tokenize(sent).split()
    elif lang == "zh_cn":
        words = list(tokenizer.getWS(sent))
    elif lang == "zh_tw":
        words = list(tokenizer.cut(sent, cut_all=False))
    elif lang == "ar":
        words = tokenizer.tokenize(sent)
    else:  # Mostly European languages
        sent = re.sub("([!.?,])", r" \1", sent)
        words = sent.split()

    return words


def refine(fin, lang, max_lines, tokenizer, ignore_first_word):
    lines = codecs.open(fin, 'r', 'utf-8').read().split("\n")
    lines = lines[:max_lines]
    sents = [normalize(word_segment(sent, lang, tokenizer), ignore_first_word) for sent in tqdm(lines)]
    return sents


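# Indices are assigned in descending frequency order (0 = most frequent word), which
# `update_monolingual_dict` relies on when it compares an index against `cutoff`.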
def create_conversion_dicts(sents, n_lexicon):
    word2idx, idx2word, idx2cnt = dict(), dict(), dict()
    word2cnt = Counter(tqdm(list(chain.from_iterable(sents))))
    for idx, (word, cnt) in enumerate(word2cnt.most_common(n_lexicon)):
        word2idx[word] = idx
        idx2word[idx] = word
        idx2cnt[idx] = cnt

    return word2idx, idx2word, idx2cnt

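# x2xs maps a word index to its within-sentence collocates and their counts:
# {x: {x_collocate: count, ...}}. Only the `cutoff` most frequent words are kept as collocates.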
def update_monolingual_dict(xs, x2xs, cutoff):
    for x in xs:
        for _x in xs:  # _x: collocate
            if x == _x: continue
            if _x > cutoff: continue  # Indices are frequency-ranked, so this drops infrequent collocates to save memory.
            if x not in x2xs: x2xs[x] = dict()
            if _x not in x2xs[x]: x2xs[x][_x] = 0
            x2xs[x][_x] += 1
    return x2xs


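# `adjust_dict` reranks the candidate translations y of a source word x by their
# co-occurrence probability minus what is already explained by x's own collocates:
#     score(y | x) = p(y | x) - sum over collocates x' of p(x' | x) * p(y | x')
# The apparent intent is to demote words that co-occur with nearly everything
# (e.g. function words) so that more specific translations rise to the top.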
def adjust_dict(x2ys, x2cnt, x2xs, reranking_width, n_trans):
    _x2ys = dict()
    for x, ys in tqdm(x2ys.items()):
        if x not in x2xs: continue  # If there are no collocates, there is nothing to adjust.
        cntx = x2cnt[x]
        y_scores = []
        for y, cnty in sorted(ys.items(), key=operator.itemgetter(1), reverse=True)[:reranking_width]:
            ts = cnty / float(cntx)  # translation score: initial value
            for x2, cntx2 in x2xs[x].items():  # Collocates
                p_x_x2 = cntx2 / float(cntx)
                p_x2_y2 = 0
                if x2 in x2ys:
                    p_x2_y2 = x2ys[x2].get(y, 0) / float(x2cnt[x2])
                ts -= (p_x_x2 * p_x2_y2)
            y_scores.append((y, ts))
        _ys = sorted(y_scores, key=operator.itemgetter(1), reverse=True)[:n_trans]
        _ys = [each[0] for each in _ys]
        _x2ys[x] = _ys

    return _x2ys

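# Tokenizer packages are imported lazily inside the branches below, so only the
# dependencies for the two requested languages need to be installed
# (nltk, konlpy, Mykytea, jieba, pyvi, pythainlp, pyarabic).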
def load_tokenizer(lang):
    if lang == "en":
        from nltk.tokenize import word_tokenize as wt
        tokenizer = wt
    elif lang == "ko":
        from konlpy.tag import Kkma
        tokenizer = Kkma()
    elif lang == "ja":
        import Mykytea
        opt = "-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_tw":
        import jieba
        tokenizer = jieba
    elif lang == "vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang == "th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize
    elif lang == "ar":
        import pyarabic.araby as araby
        tokenizer = araby
    else:
        tokenizer = None

    return tokenizer


# def sanity_check(word2x, x2ys, _x2ys, y2word, reranking_width):
#     if "time" not in word2x: return ""
#     time_id = word2x["time"]
#
#     # before adjustment
#     ys = x2ys[time_id]
#     y_cnt = sorted(ys.items(), key=operator.itemgetter(1), reverse=True)[:reranking_width]
#     print("\tbefore adjustment the translations of `time` were =>", " | ".join(y2word[y] for y, cnt in y_cnt))
#
#     # after adjustment
#     ys = _x2ys[time_id]
#     print("\tafter adjustment the translations of `time` are => ", " | ".join(y2word[y] for y in ys))

def main(hp):
    logging.info("Step 0. Download ..")
    lang1, lang2 = sorted([hp.lang1, hp.lang2])
    download(lang1, lang2)

    logging.info("Step 1. Load tokenizer ..")
    tokenizer1 = load_tokenizer(lang1)
    tokenizer2 = load_tokenizer(lang2)

    logging.info("Step 2. Normalize sentences ..")
    logging.info(f"Working on {lang1} ..")
    fin = f'data/OpenSubtitles.{lang1}-{lang2}.{lang1}'
    sents1 = refine(fin, lang1, hp.max_lines, tokenizer1, hp.ignore_first_word1)

    logging.info(f"Working on {lang2} ..")
    fin = f'data/OpenSubtitles.{lang1}-{lang2}.{lang2}'
    sents2 = refine(fin, lang2, hp.max_lines, tokenizer2, hp.ignore_first_word2)

    assert len(sents1) == len(sents2), \
        f"""{lang1} and {lang2} MUST have the same number of lines.
        {lang1} has {len(sents1)} lines, but {lang2} has {len(sents2)} lines."""

    # Create folder
    savedir = get_savedir()
    os.makedirs(savedir, exist_ok=True)

    print("Step 3. Initialize dictionaries")
    # conversion dictionaries
    word2x, x2word, x2cnt = create_conversion_dicts(sents1, hp.n_lexicon)
    word2y, y2word, y2cnt = create_conversion_dicts(sents2, hp.n_lexicon)

    # monolingual collocation dictionaries
    x2xs = dict()  # {x: {x1: cnt, x2: cnt, ...}}
    y2ys = dict()  # {y: {y1: cnt, y2: cnt, ...}}

    # crosslingual collocation dictionaries
    x2ys = dict()  # {x: {y1: cnt, y2: cnt, ...}}
    y2xs = dict()  # {y: {x1: cnt, x2: cnt, ...}}

    print("Step 4. Update dictionaries ...")
    line_num = 1
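    # line_num counts only the sentence pairs that survive the length filter below.
    # Once it exceeds hp.lexicon_lines, no new word pairs are added; counts of pairs
    # already seen more than once keep being updated.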
    for sent1, sent2 in tqdm(zip(sents1, sents2), total=len(sents1)):
        if len(sent1) <= 1 or len(sent2) <= 1: continue

        # To indices
        xs = [word2x[word] for word in sent1 if word in word2x]
        ys = [word2y[word] for word in sent2 if word in word2y]

        # Monolingual dictionary updates
        x2xs = update_monolingual_dict(xs, x2xs, hp.cutoff)
        y2ys = update_monolingual_dict(ys, y2ys, hp.cutoff)

        # Crosslingual dictionary updates
        for x in xs:
            for y in ys:
                if line_num <= hp.lexicon_lines:
                    ## lang1 -> lang2
                    if x not in x2ys: x2ys[x] = dict()
                    if y not in x2ys[x]: x2ys[x][y] = 0
                    x2ys[x][y] += 1

                    ## lang2 -> lang1
                    if y not in y2xs: y2xs[y] = dict()
                    if x not in y2xs[y]: y2xs[y][x] = 0
                    y2xs[y][x] += 1

                else:  # We don't add new words after some point to save memory.
                    ## lang1 -> lang2
                    if x in x2ys and y in x2ys[x] and x2ys[x][y] > 1:
                        x2ys[x][y] += 1

                    ## lang2 -> lang1
                    if y in y2xs and x in y2xs[y] and y2xs[y][x] > 1:
                        y2xs[y][x] += 1
        line_num += 1

| 233 | + print("Step 5. Adjust ...") |
| 234 | + _x2ys = adjust_dict(x2ys, x2cnt, x2xs, hp.reranking_width, hp.n_trans) |
| 235 | + _y2xs = adjust_dict(y2xs, y2cnt, y2ys, hp.reranking_width, hp.n_trans) |
| 236 | + |
| 237 | + # print("Step 5. Sanity check") |
| 238 | + # if lang1 == "en": |
| 239 | + # sanity_check(word2x, x2ys, _x2ys, y2word, hp.reranking_width) |
| 240 | + # elif lang2 == "en": |
| 241 | + # sanity_check(word2y, y2xs, _y2xs, x2word, hp.reranking_width) |
| 242 | + # else: |
| 243 | + # pass |
| 244 | + |
| 245 | + print("Step 6. Save") |
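    # Each pickle holds (word -> index for the source language,
    #                    index -> word for the target language,
    #                    source index -> ranked list of target-word indices).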
    pickle.dump((word2x, y2word, _x2ys), open(f'{savedir}/{lang1}-{lang2}.pkl', 'wb'))
    pickle.dump((word2y, x2word, _y2xs), open(f'{savedir}/{lang2}-{lang1}.pkl', 'wb'))

    print("Done!")

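# A rough usage sketch (the script name, save directory, and example word are
# illustrative, not fixed by this file):
#
#   python make.py --lang1 en --lang2 es
#
# produces <savedir>/en-es.pkl and <savedir>/es-en.pkl, which can be queried like:
#
#   word2x, y2word, x2ys = pickle.load(open("<savedir>/en-es.pkl", "rb"))
#   y2word[x2ys[word2x["apple"]][0]]  # highest-ranked candidate translation of "apple"
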
if __name__ == "__main__":
    # arguments setting
    parser = argparse.ArgumentParser()
    parser.add_argument('--lang1', type=str, required=True,
                        help="ISO 639-1 code of language. See `http://opus.lingfil.uu.se/OpenSubtitles2016.php`")
    parser.add_argument('--lang2', type=str, required=True,
                        help="ISO 639-1 code of language. See `http://opus.lingfil.uu.se/OpenSubtitles2016.php`")
    parser.add_argument('--max_lines', type=int, default=1000000, help="maximum number of lines that are used")
    parser.add_argument('--ignore_first_word1', dest="ignore_first_word1", action="store_true",
                        help="Ignore the first word of each sentence in the source lang because its true case is unknown.")
    parser.add_argument('--ignore_first_word2', dest="ignore_first_word2", action="store_true",
                        help="Ignore the first word of each sentence in the target lang because its true case is unknown.")
    parser.add_argument('--cutoff', type=int, default=1000,
                        help="only the `cutoff` most frequent words are considered when counting collocations")
    parser.add_argument('--lexicon_lines', type=int, default=100000,
                        help="number of lines after which new word pairs are no longer added, to save memory")
    parser.add_argument('--n_lexicon', type=int, default=100000,
                        help="number of words in the lexicon")
    parser.add_argument('--reranking_width', type=int, default=100,
                        help="maximum number of candidate translations considered when reranking")
    parser.add_argument('--n_trans', type=int, default=10,
                        help="number of final translations")
    hp = parser.parse_args()

    logging.basicConfig(level=logging.INFO)  # make the logging.info progress messages visible
    main(hp)
    print("Done!")