92 changes: 74 additions & 18 deletions udapi/block/tokenize/onwhitespace.py
@@ -1,36 +1,72 @@
"""Block tokenize.OnWhitespace"""
import re
from udapi.core.block import Block


class OnWhitespace(Block):
""""Base tokenizer, splits on whitespaces, fills SpaceAfter=No."""
"""Base tokenizer, splits on whitespaces, fills SpaceAfter=No.

Use the parameter `normalize_spaces=False` to preserve all whitespace in the sentence
in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field.
This is backward compatible with the CoNLL-U v2 `SpaceAfter=No` feature: the absence of any
following whitespace is marked by `SpaceAfter=No`, and a single following space results in no
whitespace-related markup.
If the text is loaded using `read.Sentences` and all whitespace needs to be preserved
(in order to be able to reconstruct the original document), the `read.Sentences` block
must be called with `rstrip=\n` or `rstrip=\r\n` to prevent stripping of the trailing
whitespace, e.g.::
$> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace normalize_spaces=0 write.Conllu

# sent_id = 1
# text = Hello world
1 Hello _ _ _ _ 0 _ _ SpacesAfter=\s\t\s
2 world _ _ _ _ 0 _ _ _
Note that the attribute `SpaceAfter=No` is missing for the token `world`, since it is
followed by a single space.

Parameters
----------
normalize_spaces : bool
whether to normalize whitespace (True, the default) or to preserve it exactly by filling the MISC attributes `SpacesAfter` and `SpacesBefore` (False)
"""

def __init__(self, normalize_spaces=True, **kwargs):
super().__init__(**kwargs)
self.normalize_spaces = normalize_spaces

@staticmethod
def tokenize_sentence(string):
"""A method to be overriden in subclasses."""
return string.split()

@staticmethod
def escape_whitespace(string):
string = re.sub(r' ', r'\\s', string)
Contributor

If I understand it correctly, it should be:
string = re.sub(' ', r'\s', string)
which could also be written as
string = re.sub(' ', '\\s', string)

Have you checked the code? Is there a test?

Contributor Author

Yes, I didn't realize that I don't need to escape \s in the replacement string. However, it seems to work the same with any of the three variants of the replacement string, which is why I didn't notice. I'll change it to the first variant you suggest.

Of course, I checked the code as well as the output on several real-world and made-up examples. But I didn't write a unit test for it.

Contributor

I was wrong, sorry. I forgot that re.sub also interprets the replacement parameter (e.g. for \1). You can use either
string = re.sub('\t', r'\\t', string)
or
string = re.sub(r'\t', r'\\t', string)
In the first case, re.sub gets a single-character string (a literal tab) as the pattern. In the second case, it gets the two-character string \t, which the regular expression engine interprets as a tab.
If you use
string = re.sub('\t', r'\t', string)
regular-expression decoding is applied to the replacement as well, so you would store a tab character in MISC, which is not what you want.
If you use (as I suggested)
string = re.sub(' ', r'\s', string)
you will get re.error: bad escape \s at position 0, because \s cannot appear in the replacement.

That said, my final suggestion
str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'})
is correct because there are no regular expressions involved.
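
For illustration, a minimal self-contained sketch of this translation-table approach (the ESCAPE_TABLE constant name is only illustrative, not necessarily what the merged code uses):

# str.translate performs plain character substitution, so no regular-expression
# escaping rules apply to the replacement strings.
ESCAPE_TABLE = str.maketrans({' ': r'\s', '\t': r'\t', '\r': r'\r', '\n': r'\n'})

def escape_whitespace(string):
    """Replace each whitespace character by its escaped two-character form."""
    return string.translate(ESCAPE_TABLE)

print(escape_whitespace('Hello \t world'))   # prints: Hello\s\t\sworld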

Contributor Author

Yes, the commands you suggested wouldn't work for '\t', '\r' or '\n'. I thought your comment concerned only '\s', because all three ways of writing it actually work in Python 3.6.3:

In [1]: a = " "
In [2]: import re
In [3]: re.sub(' ', r'\s', a)
Out[3]: '\\s'
In [4]: re.sub(' ', r'\\s', a)
Out[4]: '\\s'
In [5]: re.sub(' ', '\\s', a)
Out[5]: '\\s'

Anyway, I changed it to the maketrans + translate solution.

Contributor Author

Should I add a test?

Contributor

re.sub(' ', '\s', 'a b') works in Python 3.6.9, but fails in Python 3.7.0.

If you have time and energy to add a test, it would be nice, but there are so many tests missing (my fault)...
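
If a test does get added, a minimal sketch could exercise just the escape_whitespace helper shown in this diff (unittest-based; the test class name is illustrative, and it assumes escape_whitespace stays a static method):

import unittest
from udapi.block.tokenize.onwhitespace import OnWhitespace

class TestEscapeWhitespace(unittest.TestCase):
    def test_escape_whitespace(self):
        # Every whitespace character is replaced by its escaped two-character form.
        self.assertEqual(OnWhitespace.escape_whitespace(' \t\r\n'), r'\s\t\r\n')
        # Non-whitespace characters are left untouched.
        self.assertEqual(OnWhitespace.escape_whitespace('word'), 'word')

if __name__ == '__main__':
    unittest.main()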

string = re.sub(r'\t', r'\\t', string)
string = re.sub(r'\r', r'\\r', string)
string = re.sub(r'\n', r'\\n', string)
return string

def process_tree(self, root):
if root.children:
raise ValueError('Tree %s is already tokenized.' % root)
sentence = ' '.join(root.text.split())
#sentence = ' '.join(root.text.split())
sentence = root.text
tokens = self.tokenize_sentence(sentence)

# Check if there are any spaces before the first token
spaces_before = ""
m = re.match(r'\s+', sentence)
if m:
spaces_before = m.group(0)
sentence = sentence[len(spaces_before):]

for i, token in enumerate(tokens, 1):
space_after = False
spaces_after = ""

# Delete the token from the beginning of the sentence.
if sentence.startswith(token):
sentence = sentence[len(token):]
# This is the expected case. The sentence starts with the token.
# If it is followed by a space, delete the space and set space_after=True.
if not len(sentence):
space_after = True
elif sentence.startswith(' '):
space_after = True
sentence = sentence[1:]
else:
# The token (returned from tokenization) does not match the start of sentence.
# E.g. '. . . word' is tokenized as '... word'.
if not sentence.startswith(token):
# Let's delete the start of sentence anyway,
# using a non-greedy regex and the expected next token
# returned from the tokenization.
@@ -40,8 +76,28 @@ def process_tree(self, root):
# $sentence = $rest if (defined $rest);
raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence))

# Delete the token from the beginning of the sentence.
sentence = sentence[len(token):]

# Set the SpaceAfter and SpacesAfter properly
m = re.match(r'\s+', sentence)
if m is not None:
spaces_after = m.group(0)
sentence = sentence[len(spaces_after):]

# normalize whitespace
if self.normalize_spaces:
spaces_before = ""
# spaces_after == "" triggers SpaceAfter=No; it is never set for the last token (i.e. when len(sentence) == 0).
spaces_after = "" if not len(spaces_after) and len(sentence) else " "

# create a new node
node = root.create_child(form=token)
node.ord = i
if not space_after:
node.misc = 'SpaceAfter=No'

if i == 1 and len(spaces_before) > 0:
node.misc["SpacesBefore"] = self.escape_whitespace(spaces_before)
if not len(spaces_after):
node.misc["SpaceAfter"] = 'No'
elif spaces_after != " ":
Contributor

Alternatively, escape_whitespace_table could be a module-level constant instead of a class-level one, but I have no strong preference here, so I will merge this now.

node.misc["SpacesAfter"] = self.escape_whitespace(spaces_after)
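
For reference, a sketch of the two placements being compared in the comment above (the table name follows that comment; neither definition appears in this diff, so treat it as illustrative only):

# Module-level constant: built once at import time.
ESCAPE_WHITESPACE_TABLE = str.maketrans({' ': r'\s', '\t': r'\t', '\r': r'\r', '\n': r'\n'})

class OnWhitespaceVariant:
    # Class-level constant: also built once, but namespaced under the class,
    # keeping the table next to the method that uses it.
    escape_whitespace_table = str.maketrans({' ': r'\s', '\t': r'\t', '\r': r'\r', '\n': r'\n'})

    @staticmethod
    def escape_whitespace(string):
        return string.translate(OnWhitespaceVariant.escape_whitespace_table)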