Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions udapi/block/segment/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,20 @@
import re

class Simple(Block):
""""Heuristic segmenter, splits on sentence-final segmentation followed by uppercase."""
"""Heuristic segmenter, splits on sentence-final punctuation followed by uppercase.

The exceptions are:
1) abbreviations of names, e.g. "A. Merkel"
2) a predefined list of nonfinal abbreviations, e.g. "e.g."

Parameters
----------
keep_spaces : bool
if true, do not strip whitespace from the `text` attribute of the sentences created by segmentation
"""

def __init__(self, keep_spaces: bool = False, **kwargs) -> None:
    """Set up the segmenter block.

    Parameters
    ----------
    keep_spaces : bool
        Stored unchanged on ``self.keep_spaces``; when true,
        ``segment_string`` appends a trailing space to a segment
        at each detected sentence boundary.
    **kwargs
        Passed through unchanged to the parent class constructor.
    """
    # Let the parent class consume its own options first.
    super().__init__(**kwargs)
    self.keep_spaces = keep_spaces

@staticmethod
def is_nonfinal_abbrev(token):
Expand All @@ -16,6 +29,8 @@ def is_nonfinal_abbrev(token):

def is_boundary(self, first, second):
"""Is there a sentence boundary between the first and second token?"""
if not first or not second:
return False
if first[-1] in '"“»›)':
first = first[:-1]
if second[0] in '"„«¿¡‹(':
Expand All @@ -25,6 +40,9 @@ def is_boundary(self, first, second):
if not first[-1] in '.!?':
return False
if first[-1] == '.':
# correctly count length in "„A. Merkel"
if first[0] in '"„«¿¡‹(':
first = first[1:]
if len(first) == 2 and first[0].isupper():
return False
if self.is_nonfinal_abbrev(first[:-1]):
Expand All @@ -39,6 +57,8 @@ def segment_string(self, string):
segments = [previous]
for token in tokens[1:]:
if self.is_boundary(previous, token):
if self.keep_spaces:
segments[-1] += ' '
segments.append(token)
else:
segments[-1] += ' ' + token
Expand All @@ -64,4 +84,4 @@ def process_document(self, doc):
new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i))
new_bundle.create_tree(tree.zone).text = sentence
new_bundles.append(new_bundle)
doc.bundles = new_bundles
doc.bundles = new_bundles