Skip to content

Commit c26bd48

Browse files
author
sebastien cossin
committed
add fuzzy algorithm based on simstring algorithm called from pysimstring library
1 parent a46cda4 commit c26bd48

File tree

6 files changed

+285
-3
lines changed

6 files changed

+285
-3
lines changed

docs/source/api_doc.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,21 @@ ESpellWiseAlgo
238238
:undoc-members:
239239
:show-inheritance:
240240

241+
SimString
242+
^^^^^^^^^
243+
SimStringWrapper
244+
""""""""""""""""
245+
.. autoclass:: iamsystem.fuzzy.SimStringWrapper
246+
:members:
247+
:undoc-members:
248+
:show-inheritance:
241249

250+
ESimStringMeasure
251+
"""""""""""""""""
252+
.. autoclass:: iamsystem.fuzzy.ESimStringMeasure
253+
:members:
254+
:undoc-members:
255+
:show-inheritance:
242256

243257
Brat
244258
----

docs/source/fuzzy.rst

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,15 @@ if you use a custom tokenizer (i.e. from an external library like spaCy) you can
139139
String Distance
140140
^^^^^^^^^^^^^^^
141141
.. _spellwise: https://github.com/chinnichaitanya/spellwise
142+
.. _pysimstring: https://github.com/percevalw/pysimstring
142143

143-
This package utilizes the `spellwise`_ python library to access string distance algorithms.
144-
In the example below, iamsystem is configured with two spelling algorithms:
144+
145+
This package utilizes the `spellwise`_ and `pysimstring`_ libraries to access string distance algorithms.
146+
147+
Spellwise
148+
"""""""""
149+
150+
In the example below, iamsystem is configured with two spellwise algorithms:
145151
Levenshtein distance which measures the number of edits needed to transform one word into another,
146152
and Soundex which is a phonetic algorithm.
147153

@@ -181,6 +187,40 @@ When the number of keywords is large, these algorithms can be slow.
181187
Since their output doesn't depend on the context,
182188
I recommend using the :ref:`fuzzy:CacheFuzzyAlgos` class to store them.
183189

190+
simstring
191+
"""""""""
192+
.. _simstring: http://chokkan.org/software/simstring/
193+
194+
The `pysimstring`_ library provides an API to the fast `simstring`_ algorithm implemented in C++.
195+
196+
In the example below, all the unigrams of the keywords are indexed by simstring.
197+
Then, for each token in the document, simstring is called to return the closest matches.
198+
199+
.. code-block:: python
200+
201+
from iamsystem.fuzzy.simstring import (
202+
SimStringWrapper,
203+
ESimStringMeasure,
204+
)
205+
from iamsystem import Term, Matcher
206+
207+
term1 = Term(label="acute respiratory distress", code="J80")
208+
matcher = Matcher()
209+
matcher.add_keywords(keywords=[term1])
210+
fuzzy_ss = SimStringWrapper(
211+
words=matcher.get_keywords_unigrams(),
212+
measure=ESimStringMeasure.COSINE,
213+
threshold=0.7,
214+
)
215+
matcher.add_fuzzy_algo(fuzzy_algo=fuzzy_ss)
216+
annots = matcher.annot_text(text="acute respiratori disstress")
217+
for annot in annots:
218+
print(annot)
219+
# acute respiratori disstress 0 27 acute respiratory distress (J80)
220+
221+
Using the cosine similarity and a threshold of 0.7,
222+
the token *respiratori* is matched to *respiratory* and *disstress* is matched to *distress*.
223+
184224
CacheFuzzyAlgos
185225
^^^^^^^^^^^^^^^
186226

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ keywords = ["NLP", "semantic annotation", "entity linking"]
2020
dependencies = ['unidecode>=1.1.1', 'typing-extensions~=4.4.0', 'spellwise>=0.8.0']
2121

2222
[project.optional-dependencies]
23-
tests = ['spellwise>=0.8.0', 'nltk>=3.8.0', 'spacy>=3.2.0']
23+
tests = ['spellwise>=0.8.0', 'nltk>=3.8.0', 'spacy>=3.2.0', 'pysimstring==1.2.1']
2424
doc = ["sphinx"]
2525

2626
[project.urls]

src/iamsystem/fuzzy/simstring.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
""" pysimstring library wrapper."""
2+
import os
3+
import tempfile
4+
5+
from enum import Enum
6+
from typing import Iterable
7+
8+
from pysimstring import simstring
9+
10+
from iamsystem.fuzzy.api import NormLabelAlgo
11+
from iamsystem.fuzzy.api import SynType
12+
13+
14+
class ESimStringMeasure(Enum):
    """Similarity measures that can be selected for simstring.

    The value of each member is the name of an attribute of the
    ``pysimstring.simstring`` module: ``SimStringWrapper`` resolves it with
    ``getattr(simstring, measure.value)``.
    """

    # Declaration order is the iteration order of the enumeration.
    EXACT = "exact"
    DICE = "dice"
    COSINE = "cosine"
    JACCARD = "jaccard"
    OVERLAP = "overlap"
23+
24+
class SimStringWrapper(NormLabelAlgo):
    """SimString algorithm interface.

    The words are indexed once, at construction time, in a simstring database
    written to a private temporary directory. The database is then queried
    for every call to :py:meth:`get_syns_of_word`. Call :py:meth:`close`
    (or let the instance be garbage collected) to close the database and
    delete the temporary directory.
    """

    def __init__(
        self,
        words: Iterable[str],
        name: str = "simstring",
        measure: ESimStringMeasure = ESimStringMeasure.JACCARD,
        threshold: float = 0.5,
    ):
        """Create a fuzzy algorithm that calls simstring.

        :param words: the words to index in the simstring database.
            An easy way to provide these words is to call
            :py:meth:`~iamsystem.Matcher.get_keywords_unigrams` method after
            you added your keywords to the matcher instance.
        :param name: a name given to this algorithm. Default "simstring".
        :param measure: a similarity measure selected from
            :class:`~iamsystem.fuzzy.simstring.ESimStringMeasure`.
            Default JACCARD.
        :param threshold: similarity measure threshold. Default 0.5.
        """
        super().__init__(name=name)
        # TemporaryDirectory (rather than mkdtemp) so that the directory is
        # removed when this instance is closed or collected instead of
        # leaking one directory per instance. The reader opens the database
        # from this directory, so it is kept for the instance's lifetime.
        self._tmp_dir = tempfile.TemporaryDirectory()
        self.path = self._tmp_dir.name
        abs_path = os.path.join(self.path, "terms.simstring")
        with SimstringWriter(abs_path=abs_path) as ss_db:
            for word in words:
                ss_db.insert(word)
        self.ss_reader = simstring.reader(abs_path)
        # The enum value is the name of the measure constant exposed by the
        # simstring module.
        self.ss_reader.measure = getattr(simstring, measure.value)
        self.ss_reader.threshold = threshold

    def get_syns_of_word(self, word: str) -> Iterable[SynType]:
        """Retrieve simstring similar words.

        :param word: the word to search in the simstring database.
        :return: one synonym per word returned by simstring for the
            configured measure and threshold.
        """
        similar_words = self.ss_reader.retrieve(word)
        return [self.word_to_syn(word=similar) for similar in similar_words]

    def close(self) -> None:
        """Close the simstring database and delete its temporary directory.

        Safe to call several times, and safe to call on an instance whose
        ``__init__`` did not complete. The instance cannot be queried
        anymore once closed.
        """
        # getattr guards: __init__ may have failed before these attributes
        # were assigned, and __del__ is still called in that case.
        reader = getattr(self, "ss_reader", None)
        if reader is not None:
            reader.close()
            self.ss_reader = None
        tmp_dir = getattr(self, "_tmp_dir", None)
        if tmp_dir is not None:
            # The reader is closed first so that no file of the directory
            # is still open when it is removed.
            tmp_dir.cleanup()
            self._tmp_dir = None

    def __del__(self):
        """Close the file connection to simstring db."""
        # The safer approach is to open the file for every 'get_syns_of_word'
        # call. However, it takes more time. It seems to be ok to close
        # the file here.
        # https://stackoverflow.com/questions/44142836/open-file-inside-class
        self.close()
69+
70+
71+
class SimstringWriter:
    """Utility context manager to create a simstring database.

    Adapted from https://github.com/percevalw/pysimstring/blob/master/tests/test_simstring.py # noqa
    """

    def __init__(self, abs_path: str):
        """A context class to write a simstring database

        :param abs_path: absolute path to the file.
        """
        self.abs_path = abs_path
        # Writer handle; None until __enter__ opens the database and again
        # after __exit__ closed it.
        self.db = None

    def __enter__(self):
        """Open the file"""
        # NOTE(review): positional arguments copied from the pysimstring
        # test suite; presumably n-gram size 3, no begin/end marks and
        # unicode mode - confirm against the pysimstring API.
        self.db = simstring.writer(self.abs_path, 3, False, True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the file"""
        if self.db is not None:
            self.db.close()
            self.db = None

    def insert(self, term: str) -> None:
        """insert a term in simstring file.

        :param term: the string to add to the database.
        :raises RuntimeError: if the database is not open, i.e. the method
            is called outside of a ``with`` block.
        """
        if self.db is None:
            raise RuntimeError(
                "The simstring database is not open: "
                "call insert inside a 'with SimstringWriter(...)' block."
            )
        self.db.insert(term)

tests/test_doc.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,28 @@ def test_spellwise(self):
575575
# acute resiratory distresssss 0 28 acute respiratory distress (J80) acute(exact,LEVENSHTEIN,SOUNDEX);resiratory(LEVENSHTEIN);distresssss(SOUNDEX) # noqa
576576
self.assertEqual(1, len(annots))
577577

578+
def test_simstring(self):
    """Documentation example: fuzzy matching backed by simstring."""
    from iamsystem import Matcher
    from iamsystem import Term
    from iamsystem.fuzzy.simstring import ESimStringMeasure
    from iamsystem.fuzzy.simstring import SimStringWrapper

    ards_term = Term(label="acute respiratory distress", code="J80")
    matcher = Matcher()
    matcher.add_keywords(keywords=[ards_term])
    # Index the unigrams of the keywords, then register the algorithm.
    simstring_algo = SimStringWrapper(
        words=matcher.get_keywords_unigrams(),
        measure=ESimStringMeasure.COSINE,
        threshold=0.7,
    )
    matcher.add_fuzzy_algo(fuzzy_algo=simstring_algo)
    # Two of the three tokens are misspelled in the document.
    annots = matcher.annot_text(text="acute respiratori disstress")
    for annot in annots:
        print(annot)
    # acute respiratori disstress	0 27	acute respiratory distress (J80)
    self.assertEqual(1, len(annots))
599+
578600
def test_cache_fuzzy_algos(self):
579601
"""Cache example."""
580602
from iamsystem import Abbreviations

tests/test_simstring.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import unittest
2+
3+
from iamsystem import CacheFuzzyAlgos
4+
from iamsystem import Matcher
5+
from iamsystem.fuzzy.simstring import ESimStringMeasure
6+
from iamsystem.fuzzy.simstring import SimStringWrapper
7+
8+
9+
class MatcherTest(unittest.TestCase):
    """Tests of the simstring fuzzy algorithm, alone and inside a matcher."""

    def setUp(self) -> None:
        """Create a matcher that stores the labels 'paracetamol' and 'les'."""
        self.matcher = Matcher()
        self.matcher.add_labels(labels=["paracetamol", "les"])

    def test_threshold_1(self):
        """Test threshold=1 is exact match"""
        fuzzy_ss = SimStringWrapper(words=["paracetamol"], threshold=1)
        syns = list(fuzzy_ss.get_syns_of_word("paracetomol"))
        self.assertEqual(0, len(syns))
        syns = list(fuzzy_ss.get_syns_of_word("paracetamol"))
        self.assertEqual(1, len(syns))

    def test_threshold_0_5(self):
        """Test the default threshold (0.5): a close misspelling is
        returned, a short prefix is not."""
        fuzzy_ss = SimStringWrapper(words=["paracetamol"])
        syns = list(fuzzy_ss.get_syns_of_word("paracetomol"))
        self.assertEqual(1, len(syns))
        syns = list(fuzzy_ss.get_syns_of_word("para"))
        self.assertEqual(0, len(syns))

    def test_threshold_0_2(self):
        """Test a lower threshold (0.2) also returns a synonym for a short
        prefix."""
        fuzzy_ss = SimStringWrapper(words=["paracetamol"], threshold=0.2)
        syns = list(fuzzy_ss.get_syns_of_word("paracetomol"))
        self.assertEqual(1, len(syns))
        syns = list(fuzzy_ss.get_syns_of_word("para"))
        self.assertEqual(1, len(syns))

    def test_measure_exact(self):
        """Test the EXACT measure returns only the identical word."""
        fuzzy_ss = SimStringWrapper(
            words=["paracetamol"], measure=ESimStringMeasure.EXACT
        )
        syns = list(fuzzy_ss.get_syns_of_word("paracetomol"))
        self.assertEqual(0, len(syns))
        syns = list(fuzzy_ss.get_syns_of_word("paracetamol"))
        self.assertEqual(1, len(syns))

    def test_other_measures(self):
        """Test other similarity measure ; check it returns a synonym."""
        for measure in ESimStringMeasure:
            if measure is ESimStringMeasure.EXACT:
                continue
            # subTest: a failure names the measure that broke and the
            # remaining measures are still checked.
            with self.subTest(measure=measure.name):
                fuzzy_ss = SimStringWrapper(
                    words=["paracetamol"], measure=measure
                )
                syns = list(fuzzy_ss.get_syns_of_word("paracetomol"))
                self.assertEqual(1, len(syns))

    def test_matcher(self):
        """Test detection with a matcher"""
        fuzzy_ss = SimStringWrapper(words=self.matcher.get_keywords_unigrams())
        self.matcher.add_fuzzy_algo(fuzzy_algo=fuzzy_ss)
        annots = self.matcher.annot_text(text="le paractamol")
        self.assertEqual(1, len(annots))

    def test_cache_fuzzy_algos(self):
        """Test it can work with CacheFuzzyAlgos."""
        fuzzy_ss = SimStringWrapper(words=self.matcher.get_keywords_unigrams())
        cache = CacheFuzzyAlgos()
        cache.add_algo(algo=fuzzy_ss)
        self.matcher.add_fuzzy_algo(fuzzy_algo=cache)
        annots = self.matcher.annot_text(text="le paractamol")
        self.assertEqual(1, len(annots))

    def test_combine_multiple_algos(self):
        """Test we can add multiple simstring algorithms."""
        fuzzy_dice = SimStringWrapper(
            words=self.matcher.get_keywords_unigrams(),
            name="ss_dice",
            measure=ESimStringMeasure.DICE,
        )
        fuzzy_jaccard = SimStringWrapper(
            words=self.matcher.get_keywords_unigrams(),
            name="ss_jaccard",
            measure=ESimStringMeasure.JACCARD,
        )
        self.matcher.add_fuzzy_algo(fuzzy_algo=fuzzy_dice)
        self.matcher.add_fuzzy_algo(fuzzy_algo=fuzzy_jaccard)
        annots = self.matcher.annot_text(text="le paractamol")
        self.assertEqual(1, len(annots))
        annot = annots[0]
        # Both algorithms index the same words, so both are expected to be
        # reported for the first token of the annotation.
        algos_token_0 = annot.algos[0]
        self.assertEqual(["ss_dice", "ss_jaccard"], algos_token_0)

    def test_combine_multiple_algos_2(self):
        """Test the two simstring databases are independent to allow
        the user to customize different files."""
        fuzzy_dice = SimStringWrapper(
            words=self.matcher.get_keywords_unigrams(),
            name="ss_dice",
            measure=ESimStringMeasure.DICE,
        )
        # This second database contains no word of the matcher, so this
        # algorithm is not expected to contribute to the annotation.
        fuzzy_jaccard = SimStringWrapper(
            words=["NothingInterestingHere"],
            name="ss_jaccard",
            measure=ESimStringMeasure.JACCARD,
        )
        self.matcher.add_fuzzy_algo(fuzzy_algo=fuzzy_dice)
        self.matcher.add_fuzzy_algo(fuzzy_algo=fuzzy_jaccard)
        annots = self.matcher.annot_text(text="le paractamol")
        self.assertEqual(1, len(annots))
        annot = annots[0]
        algos_token_0 = annot.algos[0]
        self.assertEqual(["ss_dice"], algos_token_0)

0 commit comments

Comments
 (0)