DEV Community

Vee Satayamas
Vee Satayamas

Posted on

ลอง wordcutpy บน pypy3

3 มีนาคม 2562

พอทดสอบกับไฟล์ 11MB ใช้ pypy3 ทำให้ wordcutpy เร็วขึ้นเกิน 2 เท่า! คือใช้เวลาจาก 16 วินาที เหลือไม่ถึง 8 วินาที

# Benchmark transcript (the printed number is elapsed milliseconds):
#
#   (base) [vee@mint310 wiki]$ python3 wordcutpy.py
#   16598
#   (base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
#   7833
#   (base) [vee@mint310 wiki]$ python3 wordcutpy.py
#   16093
#   (base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
#   7821
#   (base) [vee@mint310 wiki]$ python3 wordcutpy.py
#   16272
#   (base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
#   7810

# wordcutpy.py
# The right way to use wordcutpy is to simply copy & paste this file;
# there is no need for pip.
# Also copy bigthai.txt into the same folder.

import os
import re
import sys
import time

# Compiled once; it is consulted for every character of the input.
_LATIN_RE = re.compile(r"[A-Za-z]")


class PrefixTree(object):
    """Prefix tree stored as a flat dict.

    ``self.tab`` maps ``(row_no, offset, member)`` to
    ``(child_row_no, is_terminal, payload)``.  ``row_no`` is the index (in
    sorted order) of the entry that first created the node; the root is
    row 0.  ``payload`` is only stored on terminal nodes.
    """

    def __init__(self, members_with_payload):
        """Build the tree from an iterable of ``(members, payload)`` pairs.

        ``members`` is a sequence (e.g. a string) whose items form the path
        from the root.  ``None`` yields an empty tree.
        """
        self.tab = {}
        if members_with_payload is None:
            return

        # Sorting guarantees that a prefix is inserted before any of its
        # extensions, so its terminal flag is recorded when the node is made.
        entries = sorted(members_with_payload, key=lambda item: item[0])
        for i, (members, payload) in enumerate(entries):
            row_no = 0
            last = len(members) - 1
            for j, member in enumerate(members):
                key = (row_no, j, member)
                if key in self.tab:
                    row_no = self.tab[key][0]
                else:
                    is_terminal = j == last
                    self.tab[key] = (i, is_terminal,
                                     payload if is_terminal else None)
                    row_no = i

    def lookup(self, i, offset, member):
        """Return ``(child_row_no, is_terminal, payload)`` or ``None``.

        ``i`` is the current row, ``offset`` the depth of ``member`` in the
        word being matched.
        """
        return self.tab.get((i, offset, member))


# Link (edge) types stored under the "type" key of a path entry.
UNK = 1
DICT = 2
INIT = 3
LATIN = 4
PUNC = 5


def is_better(link0, link1):
    """Return True if candidate ``link1`` should replace ``link0``.

    ``link0`` may be ``None`` (no candidate yet).  Links are ordered by
    fewest unknown tokens first, then by fewest tokens overall; on a full
    tie the existing ``link0`` is kept.
    """
    if link0 is None:
        return True
    return (link1["unk"], link1["w"]) < (link0["unk"], link0["w"])


def _known_link(path, start, link_type):
    """Make a non-unknown link covering the text from ``start`` onwards."""
    prev = path[start]
    return {"p": start,
            "w": prev["w"] + 1,
            "unk": prev["unk"],
            "type": link_type}


def build_path(dix, s):
    """Compute the best segmentation link for every prefix of ``s``.

    ``dix`` must provide ``lookup(row, offset, ch)`` like ``PrefixTree``.
    Returns a list of ``len(s) + 1`` links; ``path[k]`` describes the last
    token of the chosen segmentation of ``s[:k]``:

    * ``"p"``    -- start index of that token (``None`` for the initial link)
    * ``"w"``    -- number of tokens used so far
    * ``"unk"``  -- number of unknown tokens used so far
    * ``"type"`` -- one of ``INIT``, ``DICT``, ``LATIN``, ``PUNC``, ``UNK``
    """
    left_boundary = 0  # start index of the current run of unknown text
    dict_acc_list = []
    path = [{"p": None, "w": 0, "unk": 0, "type": INIT}]
    latin_s = None
    latin_e = None
    punc_s = None
    punc_e = None
    last = len(s) - 1

    for i, ch in enumerate(s):
        is_last = i == last

        # A dictionary word may start at every position.
        dict_acc_list.append({"s": i, "p": 0, "final": False})

        # Advance every dictionary acceptor by ``ch``; drop the dead ones.
        advanced = []
        for acc in dict_acc_list:
            child = dix.lookup(acc["p"], i - acc["s"], ch)
            if child is not None:
                child_p, is_final, _payload = child
                advanced.append({"s": acc["s"],
                                 "p": child_p,
                                 "final": is_final})
        dict_acc_list = advanced

        # Latin words: a run of ASCII letters.  The run ends at ``i`` when
        # the next character is NOT a letter (or the text ends), mirroring
        # the rule used for runs of spaces below.
        if _LATIN_RE.match(ch):
            if latin_s is None:
                latin_s = i
            if is_last or not _LATIN_RE.match(s[i + 1]):
                latin_e = i
        else:
            latin_s = None
            latin_e = None

        # Punctuation: a run of spaces.
        if ch == " ":
            if punc_s is None:
                punc_s = i
            if is_last or s[i + 1] != " ":
                punc_e = i
        else:
            punc_s = None
            punc_e = None

        # Select the best link ending right after ``ch``.
        link = None

        # Links from the word list.
        for acc in dict_acc_list:
            if acc["final"]:
                candidate = _known_link(path, acc["s"], DICT)
                if is_better(link, candidate):
                    link = candidate

        # Link from a Latin word.
        if latin_s is not None and latin_e is not None:
            candidate = _known_link(path, latin_s, LATIN)
            if is_better(link, candidate):
                link = candidate

        # Link from punctuation.
        if punc_s is not None and punc_e is not None:
            candidate = _known_link(path, punc_s, PUNC)
            if is_better(link, candidate):
                link = candidate

        # Fallback: extend the unknown token that starts at left_boundary.
        if link is None:
            prev = path[left_boundary]
            link = {"p": left_boundary,
                    "w": prev["w"] + 1,
                    "unk": prev["unk"] + 1,
                    "type": UNK}

        path.append(link)
        if link["type"] != UNK:
            # The known token ends after ``ch``, so unknown text can only
            # start at the next index.
            left_boundary = i + 1

    return path


def path_to_tokens(txt, path):
    """Follow the back-pointers in ``path`` and return the tokens of ``txt``.

    Returns ``None`` when ``path`` holds fewer than two links.
    """
    if len(path) < 2:
        return None

    end = len(path) - 1
    toks = []
    while True:
        start = path[end]["p"]
        if start is None:  # reached the initial link
            break
        toks.append(txt[start:end])
        end = start
    toks.reverse()
    return toks


def tokenize(dix, txt):
    """Split ``txt`` into a list of tokens using dictionary ``dix``.

    Returns ``[]`` for ``None`` or an empty string.
    """
    if txt is None or txt == "":
        return []
    path = build_path(dix, txt)
    return path_to_tokens(txt, path)


class Wordcut(object):
    """Dictionary-based word tokenizer."""

    def __init__(self, wordlist):
        """Create a tokenizer from an iterable of dictionary words."""
        self.dix = PrefixTree([(word, None) for word in wordlist])

    @classmethod
    def bigthai(cls):
        """Initialize from bigthai.txt located next to this file."""
        file_dir = os.path.dirname(__file__)
        filename = os.path.join(file_dir, 'bigthai.txt')
        # The word list is Thai text; do not depend on the locale encoding.
        with open(filename, encoding="utf-8") as dict_file:
            word_list = sorted({w.rstrip() for w in dict_file})
        return cls(word_list)

    def tokenize(self, s):
        """Split ``s`` into a list of tokens."""
        return tokenize(self.dix, s)


def main():
    """Tokenize wiki_plain_100k.txt into wiki.cut and print elapsed ms."""
    wordcut = Wordcut.bigthai()
    t1 = int(round(time.time() * 1000))
    with open("wiki_plain_100k.txt", encoding="utf-8") as fi:
        with open("wiki.cut", "w", encoding="utf-8") as fo:
            for line in fi:
                line = line.strip()
                print(" ".join(wordcut.tokenize(line)), file=fo)
    t2 = int(round(time.time() * 1000))
    print(t2 - t1)


if __name__ == "__main__":
    main()

# LICENSE: LGPLv3
# https://github.com/veer66/wordcutpy
Enter fullscreen mode Exit fullscreen mode

Top comments (0)