Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 63 additions & 8 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Classes and functions for the ImageConversionBackend backends."""

from typing import Any
from typing import Dict
from typing import List
from typing import Type
Expand All @@ -22,7 +23,7 @@ class ImageConversionError(ValueError): # noqa D101
class ImageConversionBackend:
"""Classes the ImageConversionBackend backend."""

def __init__(self, backend: str = "poppler", use_fallback: bool = True) -> None:
def __init__(self, backend: Any = "poppler", use_fallback: bool = True) -> None:
"""Initialize the conversion backend .

Parameters
Expand All @@ -37,15 +38,70 @@ def __init__(self, backend: str = "poppler", use_fallback: bool = True) -> None:
ValueError
Raise an error if the backend is not supported.
"""
if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend {backend!r} not supported")

self.backend: str = backend
self.backend: ConversionBackend = self.get_backend(backend)
self.use_fallback: bool = use_fallback
self.fallbacks: List[str] = list(
filter(lambda x: x != backend, BACKENDS.keys())
filter(lambda x: isinstance(backend, str) and x != backend, BACKENDS.keys())
)

def get_backend(self, backend: Any) -> Any:
    """Resolve ``backend`` into an object that can perform the conversion.

    Parameters
    ----------
    backend : str or object
        Either the name of a backend registered in ``BACKENDS`` (the
        error message below names 'poppler' and 'ghostscript'), or an
        object that exposes a callable ``convert`` attribute.

    Returns
    -------
    object
        A freshly created instance of the registered backend class when
        a name is given; otherwise ``backend`` itself, unchanged.

    Raises
    ------
    NotImplementedError
        If ``backend`` is a string that is not a key of ``BACKENDS``, or
        an object without a callable ``convert`` attribute.

    Notes
    -----
    Typical use is indirect, through the constructor, e.g.
    ``ImageConversionBackend(backend="poppler")`` or
    ``ImageConversionBackend(backend=my_custom_backend)``.
    """
    if isinstance(backend, str):
        if backend not in BACKENDS:
            raise NotImplementedError(
                f"Unknown backend {backend!r} specified. Please use either 'poppler' or 'ghostscript'."
            )
        return BACKENDS[backend]()

    # Merely having an attribute named ``convert`` is not enough: it has to
    # be callable, otherwise the failure would only surface later when the
    # conversion is attempted. ``getattr`` also honours objects that provide
    # ``convert`` dynamically (e.g. proxies), which ``dir()`` may not list.
    if not callable(getattr(backend, "convert", None)):
        raise NotImplementedError(f"{backend!r} must implement a 'convert' method")

    return backend

def convert(self, pdf_path: str, png_path: str) -> None:
"""Convert PDF to png_path.

Expand All @@ -64,8 +120,7 @@ def convert(self, pdf_path: str, png_path: str) -> None:
[description]
"""
try:
converter = BACKENDS[self.backend]()
converter.convert(pdf_path, png_path)
self.backend.convert(pdf_path, png_path)
except Exception as f:
if self.use_fallback:
for fallback in self.fallbacks:
Expand Down
25 changes: 0 additions & 25 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from typing import Any

from ..backends import ImageConversionBackend
from ..backends.image_conversion import BACKENDS
from ..image_processing import adaptive_threshold
from ..image_processing import find_contours
from ..image_processing import find_joints
Expand Down Expand Up @@ -119,34 +118,10 @@ def __init__(
self.iterations = iterations
self.resolution = resolution
self.use_fallback = use_fallback
self.backend = Lattice._get_backend(backend)
self.icb = ImageConversionBackend(use_fallback=use_fallback, backend=backend)
self.image_path = None
self.pdf_image = None

@staticmethod
def _get_backend(backend):
    """Turn a backend name or a custom backend object into a usable backend.

    A string is looked up in ``BACKENDS`` and the registered class is
    instantiated. Any other object is returned as-is, provided it has a
    public attribute named ``convert``.

    Raises
    ------
    NotImplementedError
        For an unregistered name, or for an object lacking ``convert``.
    """
    if isinstance(backend, str):
        if backend in BACKENDS.keys():
            return BACKENDS[backend]()
        raise NotImplementedError(
            f"Unknown backend {backend!r} specified. Please use either 'poppler' or 'ghostscript'."
        )

    # Only non-dunder attribute names count when looking for ``convert``.
    public_names = {name for name in dir(backend) if not name.startswith("__")}
    if "convert" not in public_names:
        raise NotImplementedError(f"{backend!r} must implement a 'convert' method")
    return backend

@staticmethod
def _shift_index(
table: Any, r_idx: int, c_idx: int, direction: str
Expand Down
43 changes: 43 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pandas.testing import assert_frame_equal

import camelot
from camelot.backends.ghostscript_backend import GhostscriptBackend
from camelot.core import Table
from camelot.core import TableList
from camelot.io import PDFHandler
Expand Down Expand Up @@ -52,6 +53,15 @@ def test_repr_ghostscript(testdir):
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


@skip_on_windows
def test_repr_ghostscript_custom_backend(testdir):
    """Check the reprs produced when a backend *instance* is passed to read_pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    parsed = camelot.read_pdf(pdf_path, backend=GhostscriptBackend())

    assert repr(parsed) == "<TableList n=1>"
    first_table = parsed[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    assert repr(first_table.cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_url_poppler():
url = "https://pypdf-table-extraction.readthedocs.io/en/latest/_static/pdf/foo.pdf"
Expand All @@ -70,6 +80,15 @@ def test_url_ghostscript(testdir):
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


@skip_on_windows
def test_url_ghostscript_custom_backend(testdir):
    """Read the sample PDF from its URL using a backend *instance*."""
    source_url = (
        "https://pypdf-table-extraction.readthedocs.io/en/latest/_static/pdf/foo.pdf"
    )
    parsed = camelot.read_pdf(source_url, backend=GhostscriptBackend())

    assert repr(parsed) == "<TableList n=1>"
    first_table = parsed[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    assert repr(first_table.cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_pages_poppler():
url = "https://pypdf-table-extraction.readthedocs.io/en/latest/_static/pdf/foo.pdf"
Expand Down Expand Up @@ -112,6 +131,30 @@ def test_pages_ghostscript():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


@skip_on_windows
def test_pages_ghostscript_custom_backend():
    """Reuse one backend instance across several ``pages`` selections.

    The same assertions are made for the default page selection, for
    ``"1-end"`` and for ``"all"``, always with the fallback disabled.
    """
    source_url = (
        "https://pypdf-table-extraction.readthedocs.io/en/latest/_static/pdf/foo.pdf"
    )
    shared_backend = GhostscriptBackend()

    # An empty mapping means "do not pass ``pages`` at all" (library default).
    for page_kwargs in ({}, {"pages": "1-end"}, {"pages": "all"}):
        parsed = camelot.read_pdf(
            source_url, backend=shared_backend, use_fallback=False, **page_kwargs
        )
        assert repr(parsed) == "<TableList n=1>"
        assert repr(parsed[0]) == "<Table shape=(7, 7)>"
        assert repr(parsed[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


def test_table_order():
def _make_table(page, order):
t = Table([], [])
Expand Down
7 changes: 6 additions & 1 deletion tests/test_image_conversion_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def convert(self, pdf_path, png_path):
def test_poppler_backend_error_when_no_use_fallback(patch_backends):
backend = ImageConversionBackend(backend="poppler", use_fallback=False)

message = "Image conversion failed with image conversion backend 'poppler'"
message = r"Image conversion failed with image conversion backend.+Poppler"
with pytest.raises(ValueError, match=message):
backend.convert("foo", "bar")

Expand All @@ -57,3 +57,8 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
message = "Image conversion failed with image conversion backend 'poppler'\n error: Image conversion failed"
with pytest.raises(ValueError, match=message):
backend.convert("foo", "bar")


def test_invalid_backend():
    """An unregistered backend name must be rejected at construction time.

    Asserting the exception explicitly (instead of marking the test as an
    expected failure) makes the test fail if the constructor stops raising
    or starts raising a different exception type.
    """
    with pytest.raises(NotImplementedError, match="Unknown backend"):
        ImageConversionBackend(backend="invalid_backend")