92 changes: 74 additions & 18 deletions udapi/block/tokenize/onwhitespace.py
@@ -1,36 +1,72 @@
"""Block tokenize.OnWhitespace"""
import re
from udapi.core.block import Block


class OnWhitespace(Block):
""""Base tokenizer, splits on whitespaces, fills SpaceAfter=No."""
"""Base tokenizer, splits on whitespaces, fills SpaceAfter=No.

Use the parameter `normalize_spaces=False` to preserve all whitespace in the sentence
in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field.
This is backward compatible with the CoNLL-U v2 `SpaceAfter=No` feature: the absence of any
following whitespace is marked by `SpaceAfter=No`, and a single following space results in no
whitespace-related markup.
If the text is loaded using `read.Sentences` and all whitespace needs to be preserved
(in order to be able to reconstruct the original document), the `read.Sentences` block
must be called with `rstrip=\n` or `rstrip=\r\n` to prevent stripping of the trailing
whitespace, e.g.::
$> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace normalize_spaces=0 write.Conllu

# sent_id = 1
# text = Hello world
1 Hello _ _ _ _ 0 _ _ SpacesAfter=\s\t\s
2 world _ _ _ _ 0 _ _ _
Note that the attribute `SpaceAfter=No` is missing for the token `world`, since it is
followed by a single space.

Parameters
----------
normalize_spaces : bool
whether to normalize whitespace (True, the default) or to preserve it exactly by filling the MISC attributes `SpacesAfter` and `SpacesBefore` (False)
"""

def __init__(self, normalize_spaces=True, **kwargs):
super().__init__(**kwargs)
self.normalize_spaces = normalize_spaces

@staticmethod
def tokenize_sentence(string):
"""A method to be overriden in subclasses."""
return string.split()

@staticmethod
def escape_whitespace(string):
string = re.sub(r' ', r'\\s', string)
Contributor

If I understand it correctly, it should be:
string = re.sub(' ', r'\s', string)
which could also be written as
string = re.sub(' ', '\\s', string)

Have you checked the code? Is there a test?

Contributor Author

Yes, I didn't realize that I don't need to escape \s in the replacement string. However, it seems to work the same with any of the three variants of the replacement string, which is why I didn't notice. I'll change it to the first variant you suggest.

Of course, I checked the code as well as the output on several real-world and made-up examples. But I didn't write a unit test for it.

Contributor

I was wrong, sorry. I forgot that re.sub also interprets the replacement parameter (e.g. for \1). You can use either
string = re.sub('\t', r'\\t', string)
or
string = re.sub(r'\t', r'\\t', string)
In the first case, re.sub gets a single-character string (a literal tab) as the pattern. In the second case, it gets the two-character string \t, which the regular expression engine interprets as a tab.
If you use
string = re.sub('\t', r'\t', string)
regular-expression decoding is applied to the replacement as well, so you would store a tab character in MISC, which is not what you want.
If you use (as I suggested)
string = re.sub(' ', r'\s', string)
you will get re.error: bad escape \s at position 0, because \s cannot appear in the replacement.

That said, my final suggestion
str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'})
is correct because there are no regular expressions involved.
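
For illustration, a minimal self-contained sketch of this translation-table approach (the ESCAPE_TABLE constant name is only illustrative, not necessarily what the merged code uses):

# str.translate performs plain character substitution, so no regular-expression
# escaping rules apply to the replacement strings.
ESCAPE_TABLE = str.maketrans({' ': r'\s', '\t': r'\t', '\r': r'\r', '\n': r'\n'})

def escape_whitespace(string):
    """Replace each whitespace character by its escaped two-character form."""
    return string.translate(ESCAPE_TABLE)

print(escape_whitespace('Hello \t world'))   # prints: Hello\s\t\sworld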

Contributor Author

Yes, the commands you suggested wouldn't work for '\t', '\r' or '\n'. I thought your comment concerned only '\s', because all three ways of writing it actually work in Python 3.6.3:

In [1]: a = " "
In [2]: import re
In [3]: re.sub(' ', r'\s', a)
Out[3]: '\\s'
In [4]: re.sub(' ', r'\\s', a)
Out[4]: '\\s'
In [5]: re.sub(' ', '\\s', a)
Out[5]: '\\s'

Anyway, I changed it to the maketrans + translate solution.

Contributor Author

Should I add a test?

Contributor

re.sub(' ', '\s', 'a b') works in Python 3.6.9, but fails in Python 3.7.0.

If you have time and energy to add a test, it would be nice, but there are so many tests missing (my fault)...
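
If a test does get added, a minimal sketch could exercise just the escape_whitespace helper shown in this diff (unittest-based; the test class name is illustrative, and it assumes escape_whitespace stays a static method):

import unittest
from udapi.block.tokenize.onwhitespace import OnWhitespace

class TestEscapeWhitespace(unittest.TestCase):
    def test_escape_whitespace(self):
        # Every whitespace character is replaced by its escaped two-character form.
        self.assertEqual(OnWhitespace.escape_whitespace(' \t\r\n'), r'\s\t\r\n')
        # Non-whitespace characters are left untouched.
        self.assertEqual(OnWhitespace.escape_whitespace('word'), 'word')

if __name__ == '__main__':
    unittest.main()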

string = re.sub(r'\t', r'\\t', string)
string = re.sub(r'\r', r'\\r', string)
string = re.sub(r'\n', r'\\n', string)
return string

def process_tree(self, root):
if root.children:
raise ValueError('Tree %s is already tokenized.' % root)
sentence = ' '.join(root.text.split())
#sentence = ' '.join(root.text.split())
sentence = root.text
tokens = self.tokenize_sentence(sentence)

# Check if there are any spaces before the first token
spaces_before = ""
m = re.match(r'\s+', sentence)
if m:
spaces_before = m.group(0)
sentence = sentence[len(spaces_before):]

for i, token in enumerate(tokens, 1):
space_after = False
spaces_after = ""

# Delete the token from the beginning of the sentence.
if sentence.startswith(token):
sentence = sentence[len(token):]
# This is the expected case. The sentence starts with the token.
# If it is followed by a space, delete the space and set space_after=True.
if not len(sentence):
space_after = True
elif sentence.startswith(' '):
space_after = True
sentence = sentence[1:]
else:
# The token (returned from tokenization) does not match the start of sentence.
# E.g. '. . . word' is tokenized as '... word'.
if not sentence.startswith(token):
# Let's delete the start of sentence anyway,
# using a non-greedy regex and the expected next token
# returned from the tokenization.
@@ -40,8 +76,28 @@ def process_tree(self, root):
# $sentence = $rest if (defined $rest);
raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence))

# Delete the token from the beginning of the sentence.
sentence = sentence[len(token):]

# Set the SpaceAfter and SpacesAfter properly
m = re.match(r'\s+', sentence)
if m is not None:
spaces_after = m.group(0)
sentence = sentence[len(spaces_after):]

# normalize whitespace
if self.normalize_spaces:
spaces_before = ""
# spaces_after == "" triggers SpaceAfter=No; it is never set for the last token (i.e. when len(sentence) == 0).
spaces_after = "" if not len(spaces_after) and len(sentence) else " "

# create a new node
node = root.create_child(form=token)
node.ord = i
if not space_after:
node.misc = 'SpaceAfter=No'

if i == 1 and len(spaces_before) > 0:
node.misc["SpacesBefore"] = self.escape_whitespace(spaces_before)
if not len(spaces_after):
node.misc["SpaceAfter"] = 'No'
elif spaces_after != " ":
Contributor

Alternatively, escape_whitespace_table could be a module-level constant instead of a class-level one, but I have no strong preference here, so I will merge this now.

node.misc["SpacesAfter"] = self.escape_whitespace(spaces_after)
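
For reference, a sketch of the two placements being compared in the comment above (the table name follows that comment; neither definition appears in this diff, so treat it as illustrative only):

# Module-level constant: built once at import time.
ESCAPE_WHITESPACE_TABLE = str.maketrans({' ': r'\s', '\t': r'\t', '\r': r'\r', '\n': r'\n'})

class OnWhitespaceVariant:
    # Class-level constant: also built once, but namespaced under the class,
    # keeping the table next to the method that uses it.
    escape_whitespace_table = str.maketrans({' ': r'\s', '\t': r'\t', '\r': r'\r', '\n': r'\n'})

    @staticmethod
    def escape_whitespace(string):
        return string.translate(OnWhitespaceVariant.escape_whitespace_table)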