Original version by chokkan and QuickUMLS.
This version removes the libiconv dependency, which required a conda installation before installing the simstring package on Windows.
Install it:
pip install pysimstring
and use it:
import os
import shutil
import tempfile
from pathlib import Path
from typing import Union

import pysimstring.simstring as simstring


class SimstringWriter:
    """
    A context class to write a simstring database.

    On entering the context, a writer is opened on the file
    ``terms.simstring`` inside ``path``; on leaving it, the writer is closed.

    Parameters
    ----------
    path: Union[str, Path]
        Path to the database directory (created if it does not exist)
    """

    def __init__(self, path: Union[str, Path]):
        os.makedirs(path, exist_ok=True)
        self.path = path

    def __enter__(self) -> "SimstringWriter":
        path = os.path.join(self.path, "terms.simstring")
        # NOTE(review): positional arguments are presumably
        # (n-gram size, begin/end marks, unicode) -- confirm against the
        # simstring writer API.
        self.db = simstring.writer(path, 3, False, True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Always close the writer, even when the with-body raised.
        self.db.close()

    def insert(self, term: str) -> None:
        """Insert a single term into the open database."""
        self.db.insert(term)


def test_simstring():
    """Write two terms, then check that near-miss queries retrieve them."""
    terms = ["paracetamol", "doliprane"]
    path = tempfile.mkdtemp()
    try:
        with SimstringWriter(path) as ss_db:
            for term in terms:
                ss_db.insert("##" + term + "##")

        ss_reader = simstring.reader(os.path.join(path, "terms.simstring"))
        try:
            ss_reader.measure = simstring.jaccard
            ss_reader.threshold = 0.5
            assert ss_reader.retrieve("##paracetomol##") == ("##paracetamol##",)
            assert ss_reader.retrieve("##doliprano##") == ("##doliprane##",)
        finally:
            # Close the reader before the directory is removed; an open
            # handle would block deletion of the database files on Windows.
            ss_reader.close()
    finally:
        # Best-effort cleanup of the temporary database directory so the
        # test does not leak files; a cleanup failure must not mask the
        # test result.
        shutil.rmtree(path, ignore_errors=True)