Skip to content
86 changes: 68 additions & 18 deletions udapi/block/tokenize/onwhitespace.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,40 @@
"""Block tokenize.OnWhitespace"""
import re
from udapi.core.block import Block


class OnWhitespace(Block):
""""Base tokenizer, splits on whitespaces, fills SpaceAfter=No."""
"""Base tokenizer, splits on whitespaces, fills SpaceAfter=No.

Use the parameter `normalize_spaces=False` to preserve all whitespaces in the sentence
in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field.
It is backward compatible with CoNLL-U v2 `SpaceAfter=No` feature. That is, no following
whitespace is marked by `SpaceAfter=No` and a single following space results in no
whitespace-related markup.
If loading the text using `read.Sentences` and all whitespaces need to be preserved
(in order to be able to reconstruct the original document), the `read.Sentences` block
must be called with `rstrip=\n` or `rstrip=\r\n` to prevent stripping the trailing
whitespace, e.g.::
$> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace normalize_spaces=0 write.Conllu

# sent_id = 1
# text = Hello world
1 Hello _ _ _ _ 0 _ _ SpacesAfter=\s\t\s
2 world _ _ _ _ 0 _ _ _
Note that the attribute `SpaceAfter=No` is missing for the token `world`, since it is
followed by a single space.

Parameters
----------
normalize_spaces : bool
    if `True` (the default), whitespace is normalized and only `SpaceAfter=No` is recorded;
    if `False`, all whitespaces are preserved by filling the MISC attributes `SpacesAfter` and `SpacesBefore`
"""

# Translation table serializing literal whitespace characters into the escaped
# forms stored in MISC: space -> \s, tab -> \t, CR -> \r, LF -> \n.
escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'})

def __init__(self, normalize_spaces=True, **kwargs):
    """Create the OnWhitespace tokenizer.

    Args:
        normalize_spaces: if True (the default), whitespace between tokens is
            normalized and only ``SpaceAfter=No`` is recorded; if False, the
            exact whitespace is preserved in the ``SpacesAfter``/``SpacesBefore``
            MISC attributes (see the class docstring for the escaping scheme).
        **kwargs: passed through to the base ``Block`` constructor.
    """
    super().__init__(**kwargs)
    # Stored flag consulted later when deciding how to record whitespace.
    self.normalize_spaces = normalize_spaces

@staticmethod
def tokenize_sentence(string):
Expand All @@ -13,24 +44,23 @@ def tokenize_sentence(string):
def process_tree(self, root):
if root.children:
raise ValueError('Tree %s is already tokenized.' % root)
sentence = ' '.join(root.text.split())
#sentence = ' '.join(root.text.split())
sentence = root.text
tokens = self.tokenize_sentence(sentence)

# Check if there are any spaces before the first token
spaces_before = ""
m = re.match(r'\s+', sentence)
if m:
spaces_before = m.group(0)
sentence = sentence[len(spaces_before):]

for i, token in enumerate(tokens, 1):
space_after = False
spaces_after = ""

# Delete the token from the beginning of the sentence.
if sentence.startswith(token):
sentence = sentence[len(token):]
# This is the expected case. The sentence starts with the token.
# If it is followed by a space, delete the space and set space_after=True.
if not len(sentence):
space_after = True
elif sentence.startswith(' '):
space_after = True
sentence = sentence[1:]
else:
# The token (returned from tokenization) does not match the start of sentence.
# E.g. '. . . word' is tokenized as '... word'.
# The token (returned from tokenization) does not match the start of sentence.
# E.g. '. . . word' is tokenized as '... word'.
if not sentence.startswith(token):
# Let's delete the start of sentence anyway,
# using a non-greedy regex and the expected next token
# returned from the tokenization.
Expand All @@ -40,8 +70,28 @@ def process_tree(self, root):
# $sentence = $rest if (defined $rest);
raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence))

# Delete the token from the beginning of the sentence.
sentence = sentence[len(token):]

# Set the SpaceAfter and SpacesAfter properly
m = re.match(r'\s+', sentence)
if m is not None:
spaces_after = m.group(0)
sentence = sentence[len(spaces_after):]

# normalize whitespace
if self.normalize_spaces:
spaces_before = ""
# spaces_after = "" <=> SpaceAfter=No is never set for the last token <=> len(sentence) = 0
spaces_after = "" if not len(spaces_after) and len(sentence) else " "

# create a new node
node = root.create_child(form=token)
node.ord = i
if not space_after:
node.misc = 'SpaceAfter=No'

if i == 1 and spaces_before:
node.misc["SpacesBefore"] = spaces_before.translate(self.escape_whitespace_table)
if not spaces_after:
node.misc["SpaceAfter"] = 'No'
elif spaces_after != " ":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, escape_whitespace_table could be a module-level constant, instead of class-level, but I have no strong preference here, so I will merge this now.

node.misc["SpacesAfter"] = spaces_after.translate(self.escape_whitespace_table)