Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit 23770f1

Browse files
committed
added pretty displays of results
1 parent 72b4f16 commit 23770f1

File tree

1 file changed

+131
-24
lines changed

1 file changed

+131
-24
lines changed

examples/camelot_quick_start_notebook.ipynb

Lines changed: 131 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,89 @@
2727
},
2828
{
2929
"cell_type": "code",
30-
"execution_count": null,
30+
"execution_count": 1,
3131
"metadata": {
3232
"collapsed": true,
3333
"id": "9xfD4-FXfgFO"
3434
},
35-
"outputs": [],
35+
"outputs": [
36+
{
37+
"name": "stdout",
38+
"output_type": "stream",
39+
"text": [
40+
"Collecting pypdf-table-extraction\n",
41+
" Downloading pypdf_table_extraction-0.0.1-py3-none-any.whl.metadata (7.4 kB)\n",
42+
"Collecting chardet<6.0.0,>=5.1.0 (from pypdf-table-extraction)\n",
43+
" Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)\n",
44+
"Collecting click>=8.0.1 (from pypdf-table-extraction)\n",
45+
" Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)\n",
46+
"Collecting numpy<2.0.0,>=1.24.2 (from pypdf-table-extraction)\n",
47+
" Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n",
48+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.0/61.0 kB\u001b[0m \u001b[31m474.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
49+
"\u001b[?25hCollecting openpyxl<4.0.0,>=3.1.0 (from pypdf-table-extraction)\n",
50+
" Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)\n",
51+
"Collecting pandas<2.0.0,>=1.5.3 (from pypdf-table-extraction)\n",
52+
" Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
53+
"Collecting pdfminer-six<20221106,>=20221105 (from pypdf-table-extraction)\n",
54+
" Downloading pdfminer.six-20221105-py3-none-any.whl.metadata (4.0 kB)\n",
55+
"Collecting pypdf<4.0.0,>=3.4.0 (from pypdf-table-extraction)\n",
56+
" Downloading pypdf-3.17.4-py3-none-any.whl.metadata (7.5 kB)\n",
57+
"Collecting tabulate<0.10.0,>=0.9.0 (from pypdf-table-extraction)\n",
58+
" Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)\n",
59+
"Collecting et-xmlfile (from openpyxl<4.0.0,>=3.1.0->pypdf-table-extraction)\n",
60+
" Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)\n",
61+
"Requirement already satisfied: python-dateutil>=2.8.1 in /home/codespace/.local/lib/python3.10/site-packages (from pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (2.9.0.post0)\n",
62+
"Requirement already satisfied: pytz>=2020.1 in /home/codespace/.local/lib/python3.10/site-packages (from pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (2024.1)\n",
63+
"Requirement already satisfied: charset-normalizer>=2.0.0 in /home/codespace/.local/lib/python3.10/site-packages (from pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (3.3.2)\n",
64+
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/python/3.10.13/lib/python3.10/site-packages (from pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (43.0.0)\n",
65+
"Requirement already satisfied: cffi>=1.12 in /home/codespace/.local/lib/python3.10/site-packages (from cryptography>=36.0.0->pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (1.16.0)\n",
66+
"Requirement already satisfied: six>=1.5 in /home/codespace/.local/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (1.16.0)\n",
67+
"Requirement already satisfied: pycparser in /home/codespace/.local/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (2.22)\n",
68+
"Downloading pypdf_table_extraction-0.0.1-py3-none-any.whl (40 kB)\n",
69+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.7/40.7 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
70+
"\u001b[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)\n",
71+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.4/199.4 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
72+
"\u001b[?25hDownloading click-8.1.7-py3-none-any.whl (97 kB)\n",
73+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.9/97.9 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
74+
"\u001b[?25hDownloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
75+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
76+
"\u001b[?25hDownloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)\n",
77+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.9/250.9 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
78+
"\u001b[?25hDownloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n",
79+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m55.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
80+
"\u001b[?25hDownloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)\n",
81+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m31.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
82+
"\u001b[?25hDownloading pypdf-3.17.4-py3-none-any.whl (278 kB)\n",
83+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.2/278.2 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
84+
"\u001b[?25hDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)\n",
85+
"Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n",
86+
"Installing collected packages: tabulate, pypdf, numpy, et-xmlfile, click, chardet, pandas, openpyxl, pdfminer-six, pypdf-table-extraction\n",
87+
" Attempting uninstall: numpy\n",
88+
" Found existing installation: numpy 2.0.0\n",
89+
" Uninstalling numpy-2.0.0:\n",
90+
" Successfully uninstalled numpy-2.0.0\n",
91+
" Attempting uninstall: pandas\n",
92+
" Found existing installation: pandas 2.2.2\n",
93+
" Uninstalling pandas-2.2.2:\n",
94+
" Successfully uninstalled pandas-2.2.2\n",
95+
"Successfully installed chardet-5.2.0 click-8.1.7 et-xmlfile-1.1.0 numpy-1.26.4 openpyxl-3.1.5 pandas-1.5.3 pdfminer-six-20221105 pypdf-3.17.4 pypdf-table-extraction-0.0.1 tabulate-0.9.0\n",
96+
"\n",
97+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
98+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
99+
"Collecting ghostscript\n",
100+
" Downloading ghostscript-0.7-py2.py3-none-any.whl.metadata (4.4 kB)\n",
101+
"Requirement already satisfied: setuptools>=38.6.0 in /usr/local/python/3.10.13/lib/python3.10/site-packages (from ghostscript) (68.2.2)\n",
102+
"Downloading ghostscript-0.7-py2.py3-none-any.whl (25 kB)\n",
103+
"Installing collected packages: ghostscript\n",
104+
"Successfully installed ghostscript-0.7\n",
105+
"\n",
106+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
107+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
108+
"E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)\n",
109+
"E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?\n"
110+
]
111+
}
112+
],
36113
"source": [
37114
"# @title 🛠️ Install Requirements\n",
38115
"!pip install pypdf-table-extraction # contains camelot\n",
@@ -43,13 +120,27 @@
43120
},
44121
{
45122
"cell_type": "code",
46-
"execution_count": null,
123+
"execution_count": 2,
47124
"metadata": {
48125
"cellView": "form",
49126
"collapsed": true,
50127
"id": "x2gOsT54QO6f"
51128
},
52-
"outputs": [],
129+
"outputs": [
130+
{
131+
"ename": "PermissionError",
132+
"evalue": "[Errno 13] Permission denied: '/content'",
133+
"output_type": "error",
134+
"traceback": [
135+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
136+
"\u001b[0;31mPermissionError\u001b[0m Traceback (most recent call last)",
137+
"Cell \u001b[0;32mIn[2], line 24\u001b[0m\n\u001b[1;32m 21\u001b[0m delete_directory(sample_data_dir)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# Create the necessary directories\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmakedirs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/content/output\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexist_ok\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 25\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/content/sample_pdfs\u001b[39m\u001b[38;5;124m'\u001b[39m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Define input and output directories\u001b[39;00m\n",
138+
"File \u001b[0;32m~/.python/current/lib/python3.10/os.py:215\u001b[0m, in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m head \u001b[38;5;129;01mand\u001b[39;00m tail \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m path\u001b[38;5;241m.\u001b[39mexists(head):\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 215\u001b[0m \u001b[43mmakedirs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhead\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexist_ok\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexist_ok\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileExistsError\u001b[39;00m:\n\u001b[1;32m 217\u001b[0m \u001b[38;5;66;03m# Defeats race condition when another thread created the path\u001b[39;00m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n",
139+
"File \u001b[0;32m~/.python/current/lib/python3.10/os.py:225\u001b[0m, in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 225\u001b[0m \u001b[43mmkdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 227\u001b[0m \u001b[38;5;66;03m# Cannot rely on checking for EEXIST, since the operating system\u001b[39;00m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;66;03m# could give priority to other errors like EACCES or EROFS\u001b[39;00m\n\u001b[1;32m 229\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exist_ok \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m path\u001b[38;5;241m.\u001b[39misdir(name):\n",
140+
"\u001b[0;31mPermissionError\u001b[0m: [Errno 13] Permission denied: '/content'"
141+
]
142+
}
143+
],
53144
"source": [
54145
"# @title 📂 Create necessary directories and delete `sample_data` if exists\n",
55146
"\n",
@@ -198,13 +289,13 @@
198289
},
199290
"outputs": [],
200291
"source": [
201-
"# @title ⚙️ Core - Simple Tables (Strict Parameters)\n",
202-
"\n",
203292
"import ghostscript\n",
204293
"import camelot\n",
205294
"import logging\n",
295+
"import pandas as pd\n",
206296
"from pathlib import Path\n",
207-
"from pypdf import PdfReader # Ensure correct import from the updated library\n",
297+
"from pypdf import PdfReader\n",
298+
"from IPython.display import display\n",
208299
"\n",
209300
"# Set up logging\n",
210301
"logging.getLogger(\"camelot\").setLevel(logging.DEBUG)\n",
@@ -224,7 +315,7 @@
224315
" logging.error(f\"Failed to open PDF {pdf_file.name} with PdfReader: {e}\")\n",
225316
" return\n",
226317
"\n",
227-
" # Read tables from the PDF using pypdf_table_extraction\n",
318+
" # Read tables from the PDF using camelot\n",
228319
" try:\n",
229320
" tables = camelot.read_pdf(str(pdf_file))\n",
230321
" except Exception as e:\n",
@@ -241,27 +332,29 @@
241332
" pdf_output_dir = output_dir / pdf_file.stem\n",
242333
" pdf_output_dir.mkdir(exist_ok=True)\n",
243334
"\n",
244-
" # Export all tables to a single CSV file\n",
245-
" try:\n",
246-
" tables.export(str(pdf_output_dir / f\"{pdf_file.stem}.csv\"), f='csv', compress=False)\n",
247-
" except Exception as e:\n",
248-
" print(f\"Failed to export tables from {pdf_file.name}: {e}\")\n",
249-
" logging.error(f\"Failed to export tables from {pdf_file.name}: {e}\")\n",
250-
" return\n",
251-
"\n",
252335
" # Process individual tables\n",
253336
" for i, table in enumerate(tables):\n",
254337
" try:\n",
338+
" # Convert table to pandas DataFrame\n",
339+
" df = table.df\n",
340+
"\n",
341+
" # Display the DataFrame\n",
342+
" print(f\"\\nTable {i+1} from {pdf_file.name}:\")\n",
343+
" display(df)\n",
344+
"\n",
255345
" # Save individual table to CSV\n",
256-
" table.to_csv(str(pdf_output_dir / f\"{pdf_file.stem}_table_{i+1}.csv\"))\n",
346+
" csv_path = pdf_output_dir / f\"{pdf_file.stem}_table_{i+1}.csv\"\n",
347+
" df.to_csv(csv_path, index=False)\n",
348+
" print(f\"Saved to {csv_path}\")\n",
349+
"\n",
257350
" # Log parsing report for each table\n",
258-
" print(f\"Table {i+1} Parsing Report:\")\n",
351+
" print(f\"\\nTable {i+1} Parsing Report:\")\n",
259352
" logging.info(f\"Table {i+1} Parsing Report:\")\n",
260353
" print(table.parsing_report)\n",
261354
" logging.info(table.parsing_report)\n",
262355
" except Exception as e:\n",
263-
" print(f\"Failed to save or log table {i+1} from {pdf_file.name}: {e}\")\n",
264-
" logging.error(f\"Failed to save or log table {i+1} from {pdf_file.name}: {e}\")\n",
356+
" print(f\"Failed to process or save table {i+1} from {pdf_file.name}: {e}\")\n",
357+
" logging.error(f\"Failed to process or save table {i+1} from {pdf_file.name}: {e}\")\n",
265358
"\n",
266359
"# Define input_dir and output_dir\n",
267360
"input_dir = Path('/content/sample_pdfs')\n",
@@ -298,11 +391,14 @@
298391
},
299392
"outputs": [],
300393
"source": [
301-
"# @title ⚙️ Core - Complex Tables (Loose Parameters)\n",
394+
"# @title ⚙️ Core - Complex Tables (Loose Parameters) with Clean Output\n",
302395
"\n",
303396
"import camelot\n",
304397
"import os\n",
305398
"from pathlib import Path\n",
399+
"import pandas as pd\n",
400+
"import numpy as np\n",
401+
"from tabulate import tabulate\n",
306402
"\n",
307403
"# Create output directory if it doesn't exist\n",
308404
"output_dir = Path('/content/output')\n",
@@ -311,7 +407,7 @@
311407
"# Process all PDF files in the input directory\n",
312408
"input_dir = Path('/content/sample_pdfs')\n",
313409
"for pdf_file in input_dir.glob('*.pdf'):\n",
314-
" print(f\"Processing {pdf_file.name}\")\n",
410+
" print(f\"\\nProcessing {pdf_file.name}\")\n",
315411
"\n",
316412
" # Using 'stream' flavor with table_areas\n",
317413
" tables_stream = camelot.read_pdf(str(pdf_file), flavor='stream', table_areas=['50,750,500,50'])\n",
@@ -334,12 +430,23 @@
334430
" tables.export(f'{output_base}.csv', f='csv', compress=True) # export all tables to CSV\n",
335431
" tables[0].to_csv(f'{output_base}_first_table.csv') # Save the first table to CSV\n",
336432
" df = tables[0].df # Get the first table as a pandas DataFrame\n",
433+
" \n",
337434
" print(f\"Tables found in {pdf_file.name}:\")\n",
338-
" print(df)\n",
435+
" \n",
436+
" # Clean up the DataFrame\n",
437+
" df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x) # Remove leading/trailing whitespace\n",
438+
" df = df.replace(['', 'nan', 'NaN', 'NULL'], np.nan).dropna(how='all') # Remove empty rows\n",
439+
" df = df.fillna('') # Replace NaN with empty string for display\n",
440+
" df = df.reset_index(drop=True) # Reset index after dropping rows\n",
441+
" \n",
442+
" # Display the clean DataFrame\n",
443+
" print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))\n",
444+
" \n",
445+
" print(f\"\\nShape of the DataFrame: {df.shape}\")\n",
339446
" else:\n",
340447
" print(f\"No tables found in {pdf_file.name}\")\n",
341448
"\n",
342-
"print(\"Processing complete. Check the output directory for results.\")"
449+
"print(\"\\nProcessing complete. Check the output directory for results.\")"
343450
]
344451
},
345452
{

0 commit comments

Comments
 (0)