Skip to content

Commit 27ce163

Browse files
committed
Remove add_labels function, refactor, update README, add ChangeLog
1 parent c480c47 commit 27ce163

File tree

11 files changed

+91
-62
lines changed

11 files changed

+91
-62
lines changed

CHANGELOG.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# ChangeLog
2+
3+
## Version 0.2.0 (beta)
4+
5+
### Breaking changes
6+
7+
- IMatcher interface: the window size parameter *w* was removed from the *annot_text* function.
8+
In the Matcher class, this parameter becomes an attribute.
9+
- Matcher: removed the *add_labels* function; the *add_keywords* function now also accepts an Iterable of string labels.
10+
- SpellWiseWrapper init: the 'algo' argument was renamed to 'measure' to be consistent with string distance algorithms.
11+
- SpellWiseWrapper *add_words_to_ignore* is deprecated, moved to the init function.
12+
- Fuzzyregex init: the 'algo_name' argument was renamed to 'name'.
13+
14+
15+
### Enhancement
16+
17+
- Added support for pysimstring library (string distance fuzzy algorithm).
18+
- Created StringDistance, parent class of pysimstring and spellwise wrapper.
19+
- Created IWords2ignore for StringDistance algorithms to ignore common words.
20+
This speeds up these algorithms and reduces the false positive rate.
21+
- Added a Matcher.build() function that greatly simplifies the construction of the Matcher.
22+
The documentation has been updated accordingly.
23+
- Added an *IBaseMatcher* interface to be the main interface of any IAMsystem matcher (currently only one),
24+
which should not be changed in the future.
25+
26+
## Version 0.1.1 (alpha)
27+
28+
### Initial release

README.md

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ matcher = Matcher.build(
3232
keywords=["North America", "South America"],
3333
stopwords=["and"],
3434
abbreviations=[("amer", "America")],
35-
spellwise=[dict(algo=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1)],
35+
spellwise=[dict(measure=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1)],
3636
w=2,
3737
)
3838
annots = matcher.annot_text(text="Northh and south Amer.")
@@ -46,13 +46,16 @@ for annot in annots:
4646
## Algorithm
4747
The algorithm was developed in the context of a [PhD thesis](https://theses.hal.science/tel-03857962/).
4848
It proposes a solution to quickly annotate documents using a large dictionary (> 300K keywords) and fuzzy matching algorithms.
49-
No string distance algorithm is implemented in this package, it imports and leverages external libraries like [spellwise](https://github.com/chinnichaitanya/spellwise)
50-
and [nltk](https://github.com/nltk/nltk).
51-
Its algorithmic complexity is O(n(log(m))) with n the number of tokens in a document and m the size of the dictionary.
49+
No string distance algorithm is implemented in this package, it imports and leverages external libraries like [spellwise](https://github.com/chinnichaitanya/spellwise),
50+
[pysimstring](https://github.com/percevalw/pysimstring) and [nltk](https://github.com/nltk/nltk).
51+
Its algorithmic complexity is *O(n(log(m)))* with n the number of tokens in a document and m the size of the dictionary.
5252
The formalization of the algorithm is available in this [paper](https://ceur-ws.org/Vol-3202/livingner-paper11.pdf).
5353

54-
The algorithm was initially developed in Java (https://github.com/scossin/IAMsystem) and
55-
has participated in several semantic annotation competitions in the medical domain where it has obtained very satisfactory results.
54+
The algorithm was initially developed in Java (https://github.com/scossin/IAMsystem).
55+
It has participated in several semantic annotation competitions in the medical field where it has obtained satisfactory results,
56+
for example by obtaining the best results in the [Codiesp shared task](https://temu.bsc.es/codiesp/index.php/2019/09/19/awards/).
57+
A dictionary-based model can achieve performance close to that of a transformer-based model when the task is simple or when the training set is small.
58+
Its main advantage is its speed, which allows a baseline to be generated quickly.
5659

5760
### Citation
5861
```
@@ -69,8 +72,3 @@ has participated in several semantic annotation competitions in the medical doma
6972
keywords = {Computer Science - Computation and Language},
7073
}
7174
```
72-
73-
## Changelog
74-
75-
**0.1.1**
76-
* First release

src/iamsystem/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
__all__ = [
22
"Matcher",
3+
"IMatcher",
4+
"IBaseMatcher",
35
"Annotation",
46
"rm_nested_annots",
57
"IStopwords",
@@ -71,6 +73,8 @@
7173
from iamsystem.matcher.annotation import Annotation
7274
from iamsystem.matcher.annotation import replace_annots
7375
from iamsystem.matcher.annotation import rm_nested_annots
76+
from iamsystem.matcher.api import IBaseMatcher
77+
from iamsystem.matcher.api import IMatcher
7478
from iamsystem.matcher.matcher import Matcher
7579
from iamsystem.stopwords.api import IStopwords
7680
from iamsystem.stopwords.negative import NegativeStopwords

src/iamsystem/fuzzy/spellwise.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,15 @@ class SpellWiseWrapper(StringDistance):
6161

6262
def __init__(
6363
self,
64-
algo: ESpellWiseAlgo,
64+
measure: ESpellWiseAlgo,
6565
max_distance: int,
6666
min_nb_char=5,
6767
words2ignore: Optional[IWords2ignore] = None,
6868
name: str = None,
6969
):
7070
"""Create an instance to take advantage of a spellwise algorithm.
7171
72-
:param algo: A value from :class:`~iamsystem.SpellWiseAlgo`
72+
:param measure: A value from :class:`~iamsystem.SpellWiseAlgo`
7373
enumerated list.
7474
:param max_distance: maximum edit distance
7575
(see spellwise documentation).
@@ -81,11 +81,11 @@ def __init__(
8181
Default: spellwise algorithm's name.
8282
"""
8383
if name is None:
84-
name = algo.name
84+
name = measure.name
8585
super().__init__(
8686
name=name, min_nb_char=min_nb_char, words2ignore=words2ignore
8787
)
88-
self._suggester: ISpellWiseAlgo = algo.value()
88+
self._suggester: ISpellWiseAlgo = measure.value()
8989
self._max_distance = max_distance
9090

9191
@property

src/iamsystem/matcher/abstract.py

Whitespace-only changes.

src/iamsystem/matcher/matcher.py

Lines changed: 14 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -191,15 +191,22 @@ def tokenize(self, text: str) -> Sequence[TokenT]:
191191
"""
192192
return self._tokenizer.tokenize(text=text)
193193

194-
def add_labels(self, labels: Iterable[str]) -> None:
195-
"""Utility function to call 'add_keywords' by providing a list of
196-
labels, :class:`~iamsystem.IKeyword` instances are created and added.
194+
def add_keywords(self, keywords: Iterable[Union[str, IKeyword]]) -> None:
195+
"""Utility function to add multiple keywords.
197196
198-
:param labels: the labels (keywords) to be searched in the document.
197+
:param keywords: an iterable of string (labels) or
198+
:class:`~iamsystem.IKeyword` to search in a document.
199199
:return: None.
200200
"""
201-
keywords = [Keyword(label=label) for label in labels]
202-
self.add_keywords(keywords=keywords)
201+
for kw in keywords:
202+
if isinstance(kw, str):
203+
kw = Keyword(label=kw)
204+
if not isinstance(kw, IKeyword):
205+
raise TypeError(
206+
f"{kw.__class__} is neither a string "
207+
f"or a class that implements the IKeyword interface."
208+
)
209+
self.add_keyword(keyword=kw)
203210

204211
def add_keyword(self, keyword: IKeyword) -> None:
205212
"""Add a keyword to find in a document.
@@ -214,15 +221,6 @@ def add_keyword(self, keyword: IKeyword) -> None:
214221
stopwords=self,
215222
)
216223

217-
def add_keywords(self, keywords: Iterable[IKeyword]) -> None:
218-
"""Utility function to add multiple keywords.
219-
220-
:param keywords: :class:`~iamsystem.IKeyword` to search in a document.
221-
:return: None.
222-
"""
223-
for keyword in keywords:
224-
self.add_keyword(keyword=keyword)
225-
226224
@property
227225
def keywords(self) -> Collection[IKeyword]:
228226
"""Return the keywords added."""
@@ -391,16 +389,7 @@ def build(
391389
matcher.remove_nested_annots = remove_nested_annots
392390

393391
# Add the keywords
394-
for kw in keywords:
395-
if isinstance(kw, str):
396-
matcher.add_labels(labels=[kw])
397-
elif isinstance(kw, IKeyword):
398-
matcher.add_keyword(keyword=kw)
399-
else:
400-
raise ValueError(
401-
f"{kw.__class__} is neither a string "
402-
f"or a class that implements the IKeyword interface."
403-
)
392+
matcher.add_keywords(keywords=keywords)
404393

405394
# add negative stopwords after stopwords and keywords are added
406395
# since this class needs keywords'unigrams without stopwords.

tests/test_annotation.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def test_rm_nested_terms_right_overlapping(self):
1717
"""Since 'prostate cancer' overlaps 'cancer', 'cancer' is a nested
1818
annotation to remove. 'cancer' is right-most token."""
1919
matcher = Matcher()
20-
matcher.add_labels(labels=["prostate cancer", "cancer"])
20+
matcher.add_keywords(keywords=["prostate cancer", "cancer"])
2121
matcher.remove_nested_annots = False
2222
annots = matcher.annot_text(text="prostate cancer")
2323
self.assertEqual(2, len(annots))
@@ -51,7 +51,9 @@ def test_rm_nested_terms_middle(self):
5151
Check it work with a middle term.
5252
"""
5353
matcher = Matcher()
54-
matcher.add_labels(labels=["prostate cancer undocumented", "cancer"])
54+
matcher.add_keywords(
55+
keywords=["prostate cancer undocumented", "cancer"]
56+
)
5557
matcher.remove_nested_annots = False
5658
annots = matcher.annot_text(text="prostate cancer undocumented")
5759
self.assertEqual(2, len(annots))

tests/test_doc.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ def test_readme_example(self):
2222
keywords=["North America", "South America"],
2323
stopwords=["and"],
2424
abbreviations=[("amer", "America")],
25-
spellwise=[dict(algo=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1)],
25+
spellwise=[
26+
dict(measure=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1)
27+
],
2628
w=2,
2729
)
2830
annots = matcher.annot_text(text="Northh and south Amer.")
@@ -609,11 +611,11 @@ def test_spellwise(self):
609611
keywords=[term1],
610612
spellwise=[
611613
dict(
612-
algo=ESpellWiseAlgo.LEVENSHTEIN,
614+
measure=ESpellWiseAlgo.LEVENSHTEIN,
613615
max_distance=1,
614616
min_nb_char=5,
615617
),
616-
dict(algo=ESpellWiseAlgo.SOUNDEX, max_distance=1),
618+
dict(measure=ESpellWiseAlgo.SOUNDEX, max_distance=1),
617619
],
618620
)
619621
annots = matcher.annot_text(text="acute resiratory distresssss")
@@ -633,7 +635,7 @@ def test_string_distance_ignored_w(self):
633635
keywords=["poids"],
634636
spellwise=[
635637
dict(
636-
algo=ESpellWiseAlgo.LEVENSHTEIN,
638+
measure=ESpellWiseAlgo.LEVENSHTEIN,
637639
max_distance=1,
638640
min_nb_char=4,
639641
)
@@ -646,7 +648,7 @@ def test_string_distance_ignored_w(self):
646648
keywords=["poids"],
647649
spellwise=[
648650
dict(
649-
algo=ESpellWiseAlgo.LEVENSHTEIN,
651+
measure=ESpellWiseAlgo.LEVENSHTEIN,
650652
max_distance=1,
651653
min_nb_char=4,
652654
)
@@ -695,7 +697,7 @@ def test_cache_fuzzy_algos(self):
695697
abbs = Abbreviations(name="abbs")
696698
abbs.add(short_form="a", long_form="acute", tokenizer=matcher)
697699
test = dict(
698-
algo=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1, min_nb_char=5
700+
measure=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1, min_nb_char=5
699701
)
700702
levenshtein = SpellWiseWrapper(**test)
701703
soundex = SpellWiseWrapper(ESpellWiseAlgo.SOUNDEX, max_distance=1)

tests/test_matcher.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,11 @@ def test_detect_exact_match(self):
6868

6969
def test_add_labels(self):
7070
"""This function add keywords that can be detected."""
71-
detector = Matcher()
71+
matcher = Matcher()
7272
words = ["acute respiratory distress syndrome", "diarrrhea"]
73-
detector.add_labels(labels=words)
73+
matcher.add_keywords(keywords=words)
7474
text = "Pt c/o acute respiratory distress syndrome and diarrrhea"
75-
annots = detector.annot_text(text=text)
75+
annots = matcher.annot_text(text=text)
7676
self.assertEqual(2, len(annots))
7777

7878
def test_keywords_attribute(self):
@@ -153,7 +153,7 @@ def test_unordered_words_seq(self):
153153
matcher = Matcher(tokenizer=tokenizer)
154154
tokens = matcher.tokenize(text)
155155
self.assertEqual("d", tokens[0].norm_label)
156-
matcher.add_labels(labels=["insuffisance ventriculaire gauche"])
156+
matcher.add_keywords(keywords=["insuffisance ventriculaire gauche"])
157157
matcher.w = 10
158158
annots = matcher.annot_text(text=text)
159159
self.assertEqual(1, len(annots))
@@ -323,7 +323,9 @@ def test_spellwise(self):
323323

324324
matcher = Matcher.build(
325325
keywords=get_termino_ivg(),
326-
spellwise=[dict(algo=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1)],
326+
spellwise=[
327+
dict(measure=ESpellWiseAlgo.LEVENSHTEIN, max_distance=1)
328+
],
327329
)
328330
annots = matcher.annot_text(text=text)
329331
self.assertEqual(1, len(annots))
@@ -333,7 +335,9 @@ def test_spellwise_param_order(self):
333335
text = "insuffisance cardiaqu gauche"
334336
matcher = Matcher.build(
335337
keywords=get_termino_ivg(),
336-
spellwise=[dict(max_distance=1, algo=ESpellWiseAlgo.LEVENSHTEIN)],
338+
spellwise=[
339+
dict(max_distance=1, measure=ESpellWiseAlgo.LEVENSHTEIN)
340+
],
337341
)
338342
annots = matcher.annot_text(text=text)
339343
self.assertEqual(1, len(annots))
@@ -344,7 +348,7 @@ def test_spellwise_wrong_param(self):
344348
Matcher.build(
345349
keywords=get_termino_ivg(),
346350
spellwise=[
347-
dict(max_distance=1, measure=ESpellWiseAlgo.LEVENSHTEIN)
351+
dict(max_distance=1, algo=ESpellWiseAlgo.LEVENSHTEIN)
348352
],
349353
)
350354

@@ -359,7 +363,7 @@ def test_spellwise_params_overrides(self):
359363
spellwise=[
360364
dict(
361365
max_distance=1,
362-
algo=ESpellWiseAlgo.LEVENSHTEIN,
366+
measure=ESpellWiseAlgo.LEVENSHTEIN,
363367
words2ignore=words2ignore,
364368
)
365369
],
@@ -374,7 +378,7 @@ def test_spellwise_params_overrides(self):
374378
spellwise=[
375379
dict(
376380
max_distance=1,
377-
algo=ESpellWiseAlgo.LEVENSHTEIN,
381+
measure=ESpellWiseAlgo.LEVENSHTEIN,
378382
# words2ignore=words2ignore
379383
)
380384
],

tests/test_simstring.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
class MatcherTest(unittest.TestCase):
1010
def setUp(self) -> None:
1111
self.matcher = Matcher()
12-
self.matcher.add_labels(labels=["paracetamol", "les"])
12+
self.matcher.add_keywords(keywords=["paracetamol", "les"])
1313

1414
def test_threshold_1(self):
1515
"""Test threshold=1 is exact match"""

0 commit comments

Comments
 (0)