Skip to content
Prev Previous commit
Next Next commit
fixes after Martin's code review
  • Loading branch information
michnov committed Feb 19, 2021
commit dd8bb897b7bded6f8585d9593824526b221c34ea
18 changes: 6 additions & 12 deletions udapi/block/tokenize/onwhitespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class OnWhitespace(Block):
preserve whitespaces by filling MISC attributes `SpacesAfter` and `SpacesBefore` (by default True)
"""

escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'})

def __init__(self, normalize_spaces=True, **kwargs):
super().__init__(**kwargs)
self.normalize_spaces = normalize_spaces
Expand All @@ -39,14 +41,6 @@ def tokenize_sentence(string):
"""A method to be overridden in subclasses."""
return string.split()

@staticmethod
def escape_whitespace(string):
string = re.sub(r' ', r'\\s', string)
string = re.sub(r'\t', r'\\t', string)
string = re.sub(r'\r', r'\\r', string)
string = re.sub(r'\n', r'\\n', string)
return string

def process_tree(self, root):
if root.children:
raise ValueError('Tree %s is already tokenized.' % root)
Expand Down Expand Up @@ -95,9 +89,9 @@ def process_tree(self, root):
node = root.create_child(form=token)
node.ord = i

if i == 1 and len(spaces_before) > 0:
node.misc["SpacesBefore"] = self.escape_whitespace(spaces_before)
if not len(spaces_after):
if i == 1 and spaces_before:
node.misc["SpacesBefore"] = spaces_before.translate(self.escape_whitespace_table)
if not spaces_after:
node.misc["SpaceAfter"] = 'No'
elif spaces_after != " ":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, escape_whitespace_table could be a module-level constant, instead of class-level, but I have no strong preference here, so I will merge this now.

node.misc["SpacesAfter"] = self.escape_whitespace(spaces_after)
node.misc["SpacesAfter"] = spaces_after.translate(self.escape_whitespace_table)