udapi · martinpopel · Feb 25, 2021 · Feb 24, 2021 · Feb 25, 2021 · Feb 25, 2021
diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py
@@ -4,7 +4,20 @@
 import re
 
 class Simple(Block):
- """"Heuristic segmenter, splits on sentence-final segmentation followed by uppercase."""
+ """Heuristic segmenter, splits on sentence-final punctuation followed by uppercase.
+ The exceptions are:
+ 1) abbreviations of names, e.g. "A. Merkel"
+ 2) predefined list of nonfinal abbreviations, e.g. "e.g."
+
+ Parameters
+ ----------
+ keep_spaces : bool
+ if true, do not strip whitespace from the `text` attribute of the sentences created by segmentation
+ """
+
+ def __init__(self, keep_spaces=False, **kwargs):
+ super().__init__(**kwargs)
+ self.keep_spaces = keep_spaces
 
  @staticmethod
  def is_nonfinal_abbrev(token):
@@ -16,6 +29,8 @@ def is_nonfinal_abbrev(token):
 
  def is_boundary(self, first, second):
  """Is there a sentence boundary between the first and second token?"""
+ if not first or not second:
+ return False
  if first[-1] in '"“»›)':
  first = first[:-1]
  if second[0] in '"„«¿¡‹(':
@@ -25,6 +40,9 @@ def is_boundary(self, first, second):
  if not first[-1] in '.!?':
  return False
  if first[-1] == '.':
+ # drop a leading opening quote/bracket so the length check below also works for e.g. "„A. Merkel"
+ if first[0] in '"„«¿¡‹(':
+ first = first[1:]
  if len(first) == 2 and first[0].isupper():
  return False
  if self.is_nonfinal_abbrev(first[:-1]):
@@ -39,6 +57,8 @@ def segment_string(self, string):
  segments = [previous]
  for token in tokens[1:]:
  if self.is_boundary(previous, token):
+ if self.keep_spaces:
+ segments[-1] += ' '
  segments.append(token)
  else:
  segments[-1] += ' ' + token
@@ -64,4 +84,4 @@ def process_document(self, doc):
  new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i))
  new_bundle.create_tree(tree.zone).text = sentence
  new_bundles.append(new_bundle)
- doc.bundles = new_bundles
+ doc.bundles = new_bundles