Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 0 additions & 15 deletions camelot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

from .backends import ImageConversionBackend
from .utils import build_file_path_in_temp_dir
from .utils import compute_whitespace
from .utils import get_index_closest_point
from .utils import get_textline_coords

Expand Down Expand Up @@ -611,20 +610,6 @@ def parsing_report(self):
}
return report

def record_metadata(self, parser):
    """Copy origin details from ``parser`` onto this table.

    Stores the parser's ``id``, ``filename`` and ``debug_info``, applies
    ``copy_spanning_text`` when the parser supplies a ``copy_text``
    setting, then builds the DataFrame view of ``self.data`` together with
    its shape, whitespace score and the source PDF page size.

    Parameters
    ----------
    parser : object
        Must expose ``id``, ``filename``, ``debug_info``, ``copy_text``,
        ``pdf_width`` and ``pdf_height`` attributes.
    """
    self.flavor = parser.id
    self.filename = parser.filename
    self.debug_info = parser.debug_info

    # Runs before ``self.data`` is read, same order as before.
    # NOTE(review): presumably this changes what ``self.data`` yields -- confirm.
    spanning = parser.copy_text
    if spanning is not None:
        self.copy_spanning_text(spanning)

    table_data = self.data
    frame = pd.DataFrame(table_data)
    self.df = frame
    self.shape = frame.shape

    self.whitespace = compute_whitespace(table_data)
    self.pdf_size = (parser.pdf_width, parser.pdf_height)

def get_pdf_image(self):
"""Compute pdf image and cache it."""
if self._image is None:
Expand Down
3 changes: 0 additions & 3 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import multiprocessing as mp
import os
import sys
from pathlib import Path

from pypdf import PdfReader
Expand Down Expand Up @@ -71,8 +70,6 @@ def __init__(
self.password = "" # noqa: S105
else:
self.password = password
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)

def _get_pages(self, pages):
Expand Down
1 change: 0 additions & 1 deletion camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def _nurminen_table_detection(self, textlines):
Assumes that tables are situated relatively far apart
vertically.
"""
# TODO: add support for arabic text #141
# sort textlines in reading order
textlines.sort(key=lambda x: (-x.y0, x.x0))
textedges = TextEdges(edge_tol=self.edge_tol)
Expand Down
26 changes: 0 additions & 26 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -747,32 +747,6 @@ def find_columns_boundaries(tls, min_gap=1.0):
return cols_bounds


def find_rows_boundaries(tls, min_gap=1.0):
    """Make a list of disjoint row boundaries for a list of text objects.

    Parameters
    ----------
    tls : iterable of PDFMiner text objects
        Each object must expose ``y0`` and ``y1`` attributes. The input is
        left untouched: a sorted copy is used internally.
    min_gap : float, optional (default: 1.0)
        Minimum distance between rows. Any elements closer than this
        threshold are merged together.

    Returns
    -------
    boundaries : list
        List y-coordinates for rows, ordered by increasing bottom edge.
        [[1st row bottom, 1st row top], [2nd row bottom, 2nd row top], ...]
    """
    rows_bounds = []
    # Iterate over a sorted copy so the caller's list is not reordered as a
    # hidden side effect.
    for tl in sorted(tls, key=lambda tl: tl.y0):
        if not rows_bounds or rows_bounds[-1][1] + min_gap < tl.y0:
            # Gap to the previous row is large enough: start a new row.
            rows_bounds.append([tl.y0, tl.y1])
        else:
            # Within min_gap of the current row: grow its top edge if needed.
            rows_bounds[-1][1] = max(rows_bounds[-1][1], tl.y1)
    return rows_bounds


def boundaries_to_split_lines(boundaries):
"""Find split lines given a list of boundaries between rows or cols.

Expand Down
1 change: 0 additions & 1 deletion pypdf_table_extraction/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from camelot.utils import download_url # noqa F401
from camelot.utils import expand_bbox_with_textline # noqa F401
from camelot.utils import find_columns_boundaries # noqa F401
from camelot.utils import find_rows_boundaries # noqa F401
from camelot.utils import flag_font_size # noqa F401
from camelot.utils import flavor_to_kwargs # noqa F401
from camelot.utils import get_index_closest_point # noqa F401
Expand Down
Loading