|
27 | 27 | }, |
28 | 28 | { |
29 | 29 | "cell_type": "code", |
30 | | - "execution_count": null, |
| 30 | + "execution_count": 1, |
31 | 31 | "metadata": { |
32 | 32 | "collapsed": true, |
33 | 33 | "id": "9xfD4-FXfgFO" |
34 | 34 | }, |
35 | | - "outputs": [], |
| 35 | + "outputs": [ |
| 36 | + { |
| 37 | + "name": "stdout", |
| 38 | + "output_type": "stream", |
| 39 | + "text": [ |
| 40 | + "Collecting pypdf-table-extraction\n", |
| 41 | + " Downloading pypdf_table_extraction-0.0.1-py3-none-any.whl.metadata (7.4 kB)\n", |
| 42 | + "Collecting chardet<6.0.0,>=5.1.0 (from pypdf-table-extraction)\n", |
| 43 | + " Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)\n", |
| 44 | + "Collecting click>=8.0.1 (from pypdf-table-extraction)\n", |
| 45 | + " Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)\n", |
| 46 | + "Collecting numpy<2.0.0,>=1.24.2 (from pypdf-table-extraction)\n", |
| 47 | + " Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n", |
| 48 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.0/61.0 kB\u001b[0m \u001b[31m474.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", |
| 49 | + "\u001b[?25hCollecting openpyxl<4.0.0,>=3.1.0 (from pypdf-table-extraction)\n", |
| 50 | + " Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)\n", |
| 51 | + "Collecting pandas<2.0.0,>=1.5.3 (from pypdf-table-extraction)\n", |
| 52 | + " Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n", |
| 53 | + "Collecting pdfminer-six<20221106,>=20221105 (from pypdf-table-extraction)\n", |
| 54 | + " Downloading pdfminer.six-20221105-py3-none-any.whl.metadata (4.0 kB)\n", |
| 55 | + "Collecting pypdf<4.0.0,>=3.4.0 (from pypdf-table-extraction)\n", |
| 56 | + " Downloading pypdf-3.17.4-py3-none-any.whl.metadata (7.5 kB)\n", |
| 57 | + "Collecting tabulate<0.10.0,>=0.9.0 (from pypdf-table-extraction)\n", |
| 58 | + " Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)\n", |
| 59 | + "Collecting et-xmlfile (from openpyxl<4.0.0,>=3.1.0->pypdf-table-extraction)\n", |
| 60 | + " Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)\n", |
| 61 | + "Requirement already satisfied: python-dateutil>=2.8.1 in /home/codespace/.local/lib/python3.10/site-packages (from pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (2.9.0.post0)\n", |
| 62 | + "Requirement already satisfied: pytz>=2020.1 in /home/codespace/.local/lib/python3.10/site-packages (from pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (2024.1)\n", |
| 63 | + "Requirement already satisfied: charset-normalizer>=2.0.0 in /home/codespace/.local/lib/python3.10/site-packages (from pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (3.3.2)\n", |
| 64 | + "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/python/3.10.13/lib/python3.10/site-packages (from pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (43.0.0)\n", |
| 65 | + "Requirement already satisfied: cffi>=1.12 in /home/codespace/.local/lib/python3.10/site-packages (from cryptography>=36.0.0->pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (1.16.0)\n", |
| 66 | + "Requirement already satisfied: six>=1.5 in /home/codespace/.local/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (1.16.0)\n", |
| 67 | + "Requirement already satisfied: pycparser in /home/codespace/.local/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (2.22)\n", |
| 68 | + "Downloading pypdf_table_extraction-0.0.1-py3-none-any.whl (40 kB)\n", |
| 69 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.7/40.7 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", |
| 70 | + "\u001b[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)\n", |
| 71 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.4/199.4 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", |
| 72 | + "\u001b[?25hDownloading click-8.1.7-py3-none-any.whl (97 kB)\n", |
| 73 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.9/97.9 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", |
| 74 | + "\u001b[?25hDownloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n", |
| 75 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", |
| 76 | + "\u001b[?25hDownloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)\n", |
| 77 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.9/250.9 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", |
| 78 | + "\u001b[?25hDownloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n", |
| 79 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m55.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n", |
| 80 | + "\u001b[?25hDownloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)\n", |
| 81 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m31.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", |
| 82 | + "\u001b[?25hDownloading pypdf-3.17.4-py3-none-any.whl (278 kB)\n", |
| 83 | + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.2/278.2 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n", |
| 84 | + "\u001b[?25hDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)\n", |
| 85 | + "Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n", |
| 86 | + "Installing collected packages: tabulate, pypdf, numpy, et-xmlfile, click, chardet, pandas, openpyxl, pdfminer-six, pypdf-table-extraction\n", |
| 87 | + " Attempting uninstall: numpy\n", |
| 88 | + " Found existing installation: numpy 2.0.0\n", |
| 89 | + " Uninstalling numpy-2.0.0:\n", |
| 90 | + " Successfully uninstalled numpy-2.0.0\n", |
| 91 | + " Attempting uninstall: pandas\n", |
| 92 | + " Found existing installation: pandas 2.2.2\n", |
| 93 | + " Uninstalling pandas-2.2.2:\n", |
| 94 | + " Successfully uninstalled pandas-2.2.2\n", |
| 95 | + "Successfully installed chardet-5.2.0 click-8.1.7 et-xmlfile-1.1.0 numpy-1.26.4 openpyxl-3.1.5 pandas-1.5.3 pdfminer-six-20221105 pypdf-3.17.4 pypdf-table-extraction-0.0.1 tabulate-0.9.0\n", |
| 96 | + "\n", |
| 97 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", |
| 98 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n", |
| 99 | + "Collecting ghostscript\n", |
| 100 | + " Downloading ghostscript-0.7-py2.py3-none-any.whl.metadata (4.4 kB)\n", |
| 101 | + "Requirement already satisfied: setuptools>=38.6.0 in /usr/local/python/3.10.13/lib/python3.10/site-packages (from ghostscript) (68.2.2)\n", |
| 102 | + "Downloading ghostscript-0.7-py2.py3-none-any.whl (25 kB)\n", |
| 103 | + "Installing collected packages: ghostscript\n", |
| 104 | + "Successfully installed ghostscript-0.7\n", |
| 105 | + "\n", |
| 106 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", |
| 107 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n", |
| 108 | + "E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)\n", |
| 109 | + "E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?\n" |
| 110 | + ] |
| 111 | + } |
| 112 | + ], |
36 | 113 | "source": [ |
37 | 114 | "# @title 🛠️ Install Requirements\n", |
38 | 115 | "!pip install pypdf-table-extraction # contains camelot\n", |
|
43 | 120 | }, |
44 | 121 | { |
45 | 122 | "cell_type": "code", |
46 | | - "execution_count": null, |
| 123 | + "execution_count": 2, |
47 | 124 | "metadata": { |
48 | 125 | "cellView": "form", |
49 | 126 | "collapsed": true, |
50 | 127 | "id": "x2gOsT54QO6f" |
51 | 128 | }, |
52 | | - "outputs": [], |
| 129 | + "outputs": [ |
| 130 | + { |
| 131 | + "ename": "PermissionError", |
| 132 | + "evalue": "[Errno 13] Permission denied: '/content'", |
| 133 | + "output_type": "error", |
| 134 | + "traceback": [ |
| 135 | + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
| 136 | + "\u001b[0;31mPermissionError\u001b[0m Traceback (most recent call last)", |
| 137 | + "Cell \u001b[0;32mIn[2], line 24\u001b[0m\n\u001b[1;32m 21\u001b[0m delete_directory(sample_data_dir)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# Create the necessary directories\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmakedirs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/content/output\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexist_ok\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 25\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/content/sample_pdfs\u001b[39m\u001b[38;5;124m'\u001b[39m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Define input and output directories\u001b[39;00m\n", |
| 138 | + "File \u001b[0;32m~/.python/current/lib/python3.10/os.py:215\u001b[0m, in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m head \u001b[38;5;129;01mand\u001b[39;00m tail \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m path\u001b[38;5;241m.\u001b[39mexists(head):\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 215\u001b[0m \u001b[43mmakedirs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhead\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexist_ok\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexist_ok\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileExistsError\u001b[39;00m:\n\u001b[1;32m 217\u001b[0m \u001b[38;5;66;03m# Defeats race condition when another thread created the path\u001b[39;00m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n", |
| 139 | + "File \u001b[0;32m~/.python/current/lib/python3.10/os.py:225\u001b[0m, in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 225\u001b[0m \u001b[43mmkdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 227\u001b[0m \u001b[38;5;66;03m# Cannot rely on checking for EEXIST, since the operating system\u001b[39;00m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;66;03m# could give priority to other errors like EACCES or EROFS\u001b[39;00m\n\u001b[1;32m 229\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exist_ok \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m path\u001b[38;5;241m.\u001b[39misdir(name):\n", |
| 140 | + "\u001b[0;31mPermissionError\u001b[0m: [Errno 13] Permission denied: '/content'" |
| 141 | + ] |
| 142 | + } |
| 143 | + ], |
53 | 144 | "source": [ |
54 | 145 | "# @title 📂 Create necessary directories and delete `sample_data` if exists\n", |
55 | 146 | "\n", |
|
198 | 289 | }, |
199 | 290 | "outputs": [], |
200 | 291 | "source": [ |
201 | | - "# @title ⚙️ Core - Simple Tables (Strict Parameters)\n", |
202 | | - "\n", |
203 | 292 | "import ghostscript\n", |
204 | 293 | "import camelot\n", |
205 | 294 | "import logging\n", |
| 295 | + "import pandas as pd\n", |
206 | 296 | "from pathlib import Path\n", |
207 | | - "from pypdf import PdfReader # Ensure correct import from the updated library\n", |
| 297 | + "from pypdf import PdfReader\n", |
| 298 | + "from IPython.display import display\n", |
208 | 299 | "\n", |
209 | 300 | "# Set up logging\n", |
210 | 301 | "logging.getLogger(\"camelot\").setLevel(logging.DEBUG)\n", |
|
224 | 315 | " logging.error(f\"Failed to open PDF {pdf_file.name} with PdfReader: {e}\")\n", |
225 | 316 | " return\n", |
226 | 317 | "\n", |
227 | | - " # Read tables from the PDF using pypdf_table_extraction\n", |
| 318 | + " # Read tables from the PDF using camelot\n", |
228 | 319 | " try:\n", |
229 | 320 | " tables = camelot.read_pdf(str(pdf_file))\n", |
230 | 321 | " except Exception as e:\n", |
|
241 | 332 | " pdf_output_dir = output_dir / pdf_file.stem\n", |
242 | 333 | " pdf_output_dir.mkdir(exist_ok=True)\n", |
243 | 334 | "\n", |
244 | | - " # Export all tables to a single CSV file\n", |
245 | | - " try:\n", |
246 | | - " tables.export(str(pdf_output_dir / f\"{pdf_file.stem}.csv\"), f='csv', compress=False)\n", |
247 | | - " except Exception as e:\n", |
248 | | - " print(f\"Failed to export tables from {pdf_file.name}: {e}\")\n", |
249 | | - " logging.error(f\"Failed to export tables from {pdf_file.name}: {e}\")\n", |
250 | | - " return\n", |
251 | | - "\n", |
252 | 335 | " # Process individual tables\n", |
253 | 336 | " for i, table in enumerate(tables):\n", |
254 | 337 | " try:\n", |
| 338 | + " # Convert table to pandas DataFrame\n", |
| 339 | + " df = table.df\n", |
| 340 | + "\n", |
| 341 | + " # Display the DataFrame\n", |
| 342 | + " print(f\"\\nTable {i+1} from {pdf_file.name}:\")\n", |
| 343 | + " display(df)\n", |
| 344 | + "\n", |
255 | 345 | " # Save individual table to CSV\n", |
256 | | - " table.to_csv(str(pdf_output_dir / f\"{pdf_file.stem}_table_{i+1}.csv\"))\n", |
| 346 | + " csv_path = pdf_output_dir / f\"{pdf_file.stem}_table_{i+1}.csv\"\n", |
| 347 | + " df.to_csv(csv_path, index=False)\n", |
| 348 | + " print(f\"Saved to {csv_path}\")\n", |
| 349 | + "\n", |
257 | 350 | " # Log parsing report for each table\n", |
258 | | - " print(f\"Table {i+1} Parsing Report:\")\n", |
| 351 | + " print(f\"\\nTable {i+1} Parsing Report:\")\n", |
259 | 352 | " logging.info(f\"Table {i+1} Parsing Report:\")\n", |
260 | 353 | " print(table.parsing_report)\n", |
261 | 354 | " logging.info(table.parsing_report)\n", |
262 | 355 | " except Exception as e:\n", |
263 | | - " print(f\"Failed to save or log table {i+1} from {pdf_file.name}: {e}\")\n", |
264 | | - " logging.error(f\"Failed to save or log table {i+1} from {pdf_file.name}: {e}\")\n", |
| 356 | + " print(f\"Failed to process or save table {i+1} from {pdf_file.name}: {e}\")\n", |
| 357 | + " logging.error(f\"Failed to process or save table {i+1} from {pdf_file.name}: {e}\")\n", |
265 | 358 | "\n", |
266 | 359 | "# Define input_dir and output_dir\n", |
267 | 360 | "input_dir = Path('/content/sample_pdfs')\n", |
|
298 | 391 | }, |
299 | 392 | "outputs": [], |
300 | 393 | "source": [ |
301 | | - "# @title ⚙️ Core - Complex Tables (Loose Parameters)\n", |
| 394 | + "# @title ⚙️ Core - Complex Tables (Loose Parameters) with Clean Output\n", |
302 | 395 | "\n", |
303 | 396 | "import camelot\n", |
304 | 397 | "import os\n", |
305 | 398 | "from pathlib import Path\n", |
| 399 | + "import pandas as pd\n", |
| 400 | + "import numpy as np\n", |
| 401 | + "from tabulate import tabulate\n", |
306 | 402 | "\n", |
307 | 403 | "# Create output directory if it doesn't exist\n", |
308 | 404 | "output_dir = Path('/content/output')\n", |
|
311 | 407 | "# Process all PDF files in the input directory\n", |
312 | 408 | "input_dir = Path('/content/sample_pdfs')\n", |
313 | 409 | "for pdf_file in input_dir.glob('*.pdf'):\n", |
314 | | - " print(f\"Processing {pdf_file.name}\")\n", |
| 410 | + " print(f\"\\nProcessing {pdf_file.name}\")\n", |
315 | 411 | "\n", |
316 | 412 | " # Using 'stream' flavor with table_areas\n", |
317 | 413 | " tables_stream = camelot.read_pdf(str(pdf_file), flavor='stream', table_areas=['50,750,500,50'])\n", |
|
334 | 430 | " tables.export(f'{output_base}.csv', f='csv', compress=True) # export all tables to CSV\n", |
335 | 431 | " tables[0].to_csv(f'{output_base}_first_table.csv') # Save the first table to CSV\n", |
336 | 432 | " df = tables[0].df # Get the first table as a pandas DataFrame\n", |
| 433 | + " \n", |
337 | 434 | " print(f\"Tables found in {pdf_file.name}:\")\n", |
338 | | - " print(df)\n", |
| 435 | + " \n", |
| 436 | + " # Clean up the DataFrame\n", |
| 437 | + " df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x) # Remove leading/trailing whitespace\n", |
| 438 | + " df = df.replace(['', 'nan', 'NaN', 'NULL'], np.nan).dropna(how='all') # Remove empty rows\n", |
| 439 | + " df = df.fillna('') # Replace NaN with empty string for display\n", |
| 440 | + " df = df.reset_index(drop=True) # Reset index after dropping rows\n", |
| 441 | + " \n", |
| 442 | + " # Display the clean DataFrame\n", |
| 443 | + " print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))\n", |
| 444 | + " \n", |
| 445 | + " print(f\"\\nShape of the DataFrame: {df.shape}\")\n", |
339 | 446 | " else:\n", |
340 | 447 | " print(f\"No tables found in {pdf_file.name}\")\n", |
341 | 448 | "\n", |
342 | | - "print(\"Processing complete. Check the output directory for results.\")" |
| 449 | + "print(\"\\nProcessing complete. Check the output directory for results.\")" |
343 | 450 | ] |
344 | 451 | }, |
345 | 452 | { |
|
0 commit comments