Commit d5f7faa

Create make.py
1 parent 8b07e77 commit d5f7faa

1 file changed: +276 -0 lines changed

Projects/Word2Word/make.py

@@ -0,0 +1,276 @@
# -*- coding: utf-8 -*-
'''
Word2word
authors: Kyubyong Park (kbpark.linguist@gmail.com), YJ Choe (yjchoe33@gmail.com), Dongwoo Kim (kimdwkimdw@gmail.com)
'''
import codecs
import os
import re
import pickle
import operator
from collections import Counter
from itertools import chain
from tqdm import tqdm
import argparse
import logging
from utils import get_savedir


def download(lang1, lang2):
    '''Download corpora from OpenSubtitles 2018.
    Note: this shells out to wget, unzip, and rm, so it assumes a
    Unix-like environment with those tools on the PATH.'''
    download = f"wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/{lang1}-{lang2}.txt.zip -P data"
    unzip = "unzip data/*.zip -d data/"
    rm_zip = "rm data/*.zip"
    rm_ids = "rm data/*.ids"
    rm_readme = "rm README*"
    for cmd in (download, unzip, rm_zip, rm_ids, rm_readme):
        os.system(cmd)

def normalize(tokens, ignore_first_word):
    '''If ignore_first_word is True,
    we drop the first word or token
    because its true case is unclear.'''
    if ignore_first_word:
        tokens = tokens[1:]
    return tokens

def word_segment(sent, lang, tokenizer):
    if lang=="en":
        words = tokenizer(sent)
    elif lang == 'ko':
        words = [word for word, _ in tokenizer.pos(sent)]
    elif lang=='ja':
        words = [elem for elem in tokenizer.getWS(sent)]
    elif lang=='th':
        words = tokenizer(sent, engine='mm')
    elif lang=='vi':
        words = tokenizer.tokenize(sent).split()
    elif lang=='zh_cn':
        words = [elem for elem in tokenizer.getWS(sent)]
    elif lang=="zh_tw":
        words = list(tokenizer.cut(sent, cut_all=False))
    elif lang=="ar":
        words = tokenizer.tokenize(sent)
    else: # Mostly European languages
        sent = re.sub("([!.?,])", r" \1", sent)
        words = sent.split()

    return words

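# A quick illustration (a sketch, assuming NLTK and its punkt models are
# installed; the exact tokens depend on the tokenizer version):
#   >>> word_segment("I can't wait.", "en", load_tokenizer("en"))
#   ['I', 'ca', "n't", 'wait', '.']
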
def refine(fin, lang, max_lines, tokenizer, ignore_first_word):
    lines = codecs.open(fin, 'r', 'utf-8').read().split("\n")
    lines = lines[:max_lines]
    sents = [normalize(word_segment(sent, lang, tokenizer), ignore_first_word) for sent in tqdm(lines)]
    return sents


def create_conversion_dicts(sents, n_lexicon):
    word2idx, idx2word, idx2cnt = dict(), dict(), dict()
    word2cnt = Counter(tqdm(list(chain.from_iterable(sents))))
    for idx, (word, cnt) in enumerate(word2cnt.most_common(n_lexicon)):
        word2idx[word] = idx
        idx2word[idx] = word
        idx2cnt[idx] = cnt

    return word2idx, idx2word, idx2cnt

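# For example, with sents = [["a", "b"], ["a"]] the indices follow frequency
# rank: word2idx == {"a": 0, "b": 1}, idx2word == {0: "a", 1: "b"},
# and idx2cnt == {0: 2, 1: 1}.
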
def update_monolingual_dict(xs, x2xs, cutoff):
    for x in xs:
        for _x in xs: # _x: collocate
            if x == _x: continue
            # Indices are frequency ranks, so this drops collocates outside
            # the `cutoff` most frequent words to save memory.
            if _x > cutoff: continue
            if x not in x2xs: x2xs[x] = dict()
            if _x not in x2xs[x]: x2xs[x][_x] = 0
            x2xs[x][_x] += 1
    return x2xs


def adjust_dict(x2ys, x2cnt, x2xs, reranking_width, n_trans):
    _x2ys = dict()
    for x, ys in tqdm(x2ys.items()):
        if x not in x2xs: continue # If there are no collocates, there is no score to adjust.
        cntx = x2cnt[x]
        y_scores = []
        for y, cnty in sorted(ys.items(), key=operator.itemgetter(1), reverse=True)[:reranking_width]:
            ts = cnty / float(cntx) # translation score: initial value
            for x2, cntx2 in x2xs[x].items(): # Collocates
                p_x_x2 = cntx2 / float(cntx)
                p_x2_y2 = 0
                if x2 in x2ys:
                    p_x2_y2 = x2ys[x2].get(y, 0) / float(x2cnt[x2])
                ts -= (p_x_x2 * p_x2_y2)
            y_scores.append((y, ts))
        _ys = sorted(y_scores, key=lambda pair: pair[1], reverse=True)[:n_trans]
        _ys = [each[0] for each in _ys]
        _x2ys[x] = _ys

    return _x2ys

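# In effect, adjust_dict reranks the candidate translations y of x by
#     score(x, y) = p(y|x) - sum_{x2} p(x2|x) * p(y|x2),
# where x2 ranges over x's monolingual collocates: candidates that are
# already explained by x's collocates are penalized, so translations of
# frequent neighbors are suppressed in favor of translations of x itself.
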
def load_tokenizer(lang):
    if lang=="en":
        from nltk.tokenize import word_tokenize as wt
        tokenizer = wt
    elif lang=="ko":
        from konlpy.tag import Kkma
        tokenizer = Kkma()
    elif lang=="ja":
        import Mykytea
        opt="-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang=="zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang=="zh_tw":
        import jieba
        tokenizer = jieba
    elif lang=="vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang=="th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize
    elif lang=="ar":
        import pyarabic.araby as araby
        tokenizer = araby
    else:
        tokenizer = None

    return tokenizer

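# The language-specific tokenizers are optional dependencies: nltk (en),
# konlpy (ko), Mykytea with the listed .mod model files (ja, zh_cn),
# jieba (zh_tw), pyvi (vi), pythainlp (th), and pyarabic (ar). Only the
# pair passed via --lang1/--lang2 needs to be installed, since the imports
# happen lazily inside load_tokenizer.
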
# def sanity_check(word2x, x2ys, _x2ys, y2word, reranking_width):
#     if "time" not in word2x: return ""
#     time_id = word2x["time"]
#
#     # before adjustment
#     ys = x2ys[time_id]
#     y_cnt = sorted(ys.items(), key=operator.itemgetter(1), reverse=True)[:reranking_width]
#     print("\tbefore adjustment the translations of `time` were =>", " | ".join(y2word[y] for y, cnt in y_cnt))
#
#     # after adjustment
#     ys = _x2ys[time_id]
#     print("\tafter adjustment the translations of `time` are => ", " | ".join(y2word[y] for y in ys))

def main(hp):
    logging.info("Step 0. Download ..")
    lang1, lang2 = sorted([hp.lang1, hp.lang2])
    download(lang1, lang2)

    logging.info("Step 1. Load tokenizer ..")
    tokenizer1 = load_tokenizer(lang1)
    tokenizer2 = load_tokenizer(lang2)

    logging.info("Step 2. Normalize sentences ..")
    logging.info(f"Working on {lang1} ..")
    fin = f'data/OpenSubtitles.{lang1}-{lang2}.{lang1}'
    sents1 = refine(fin, lang1, hp.max_lines, tokenizer1, hp.ignore_first_word1)

    logging.info(f"Working on {lang2} ..")
    fin = f'data/OpenSubtitles.{lang1}-{lang2}.{lang2}'
    sents2 = refine(fin, lang2, hp.max_lines, tokenizer2, hp.ignore_first_word2)

    assert len(sents1) == len(sents2), \
        f"""{lang1} and {lang2} MUST have the same number of lines.\n
        {lang1} has {len(sents1)} lines, but {lang2} has {len(sents2)} lines."""

    # Create folder
    savedir = get_savedir()
    os.makedirs(savedir, exist_ok=True)

    print("Step 3. Initialize dictionaries")
    # conversion dictionaries
    word2x, x2word, x2cnt = create_conversion_dicts(sents1, hp.n_lexicon)
    word2y, y2word, y2cnt = create_conversion_dicts(sents2, hp.n_lexicon)

    # monolingual collocation dictionaries
    x2xs = dict() # {x: {x1: cnt, x2: cnt, ...}}
    y2ys = dict() # {y: {y1: cnt, y2: cnt, ...}}

    # crosslingual collocation dictionaries
    x2ys = dict() # {x: {y1: cnt, y2: cnt, ...}}
    y2xs = dict() # {y: {x1: cnt, x2: cnt, ...}}

    print("Step 4. Update dictionaries ...")
    line_num = 1
    for sent1, sent2 in tqdm(zip(sents1, sents2), total=len(sents1)):
        if len(sent1) <= 1 or len(sent2) <= 1: continue

        # To indices
        xs = [word2x[word] for word in sent1 if word in word2x]
        ys = [word2y[word] for word in sent2 if word in word2y]

        # Monolingual dictionary updates
        x2xs = update_monolingual_dict(xs, x2xs, hp.cutoff)
        y2ys = update_monolingual_dict(ys, y2ys, hp.cutoff)

        # Crosslingual dictionary updates
        for x in xs:
            for y in ys:
                if line_num <= hp.lexicon_lines:
                    ## lang1 -> lang2
                    if x not in x2ys: x2ys[x] = dict()
                    if y not in x2ys[x]: x2ys[x][y] = 0
                    x2ys[x][y] += 1

                    ## lang2 -> lang1
                    if y not in y2xs: y2xs[y] = dict()
                    if x not in y2xs[y]: y2xs[y][x] = 0
                    y2xs[y][x] += 1

                else: # We don't add new words after some point to save memory.
                    ## lang1 -> lang2
                    if x in x2ys and y in x2ys[x] and x2ys[x][y] > 1:
                        x2ys[x][y] += 1

                    ## lang2 -> lang1
                    if y in y2xs and x in y2xs[y] and y2xs[y][x] > 1:
                        y2xs[y][x] += 1
        line_num += 1

    print("Step 5. Adjust ...")
    _x2ys = adjust_dict(x2ys, x2cnt, x2xs, hp.reranking_width, hp.n_trans)
    _y2xs = adjust_dict(y2xs, y2cnt, y2ys, hp.reranking_width, hp.n_trans)

    # print("Step 5. Sanity check")
    # if lang1 == "en":
    #     sanity_check(word2x, x2ys, _x2ys, y2word, hp.reranking_width)
    # elif lang2 == "en":
    #     sanity_check(word2y, y2xs, _y2xs, x2word, hp.reranking_width)
    # else:
    #     pass

    print("Step 6. Save")
    pickle.dump((word2x, y2word, _x2ys), open(f'{savedir}/{lang1}-{lang2}.pkl', 'wb'))
    pickle.dump((word2y, x2word, _y2xs), open(f'{savedir}/{lang2}-{lang1}.pkl', 'wb'))

    print("Done!")

if __name__ == "__main__":
    # argument settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--lang1', type=str, required=True,
                        help="ISO 639-1 code of language. See `http://opus.lingfil.uu.se/OpenSubtitles2016.php`")
    parser.add_argument('--lang2', type=str, required=True,
                        help="ISO 639-1 code of language. See `http://opus.lingfil.uu.se/OpenSubtitles2016.php`")
    parser.add_argument('--max_lines', type=int, default=1000000, help="maximum number of lines that are used")
    parser.add_argument('--ignore_first_word1', dest="ignore_first_word1", action="store_true",
                        help="Ignore the first word of each line in the source language because its true case is unknown.")
    parser.add_argument('--ignore_first_word2', dest="ignore_first_word2", action="store_true",
                        help="Ignore the first word of each line in the target language because its true case is unknown.")
    parser.add_argument('--cutoff', type=int, default=1000,
                        help="number of words that are used in calculating collocation")
    parser.add_argument('--lexicon_lines', type=int, default=100000,
                        help="number of lines after which new words are no longer added, to save memory")
    parser.add_argument('--n_lexicon', type=int, default=100000,
                        help="number of words in the lexicon")
    parser.add_argument('--reranking_width', type=int, default=100,
                        help="maximum number of collocates that are considered when reranking")
    parser.add_argument('--n_trans', type=int, default=10,
                        help="number of final translations")
    hp = parser.parse_args()

    main(hp)
    print("Done!")

0 commit comments
