py-pdf
diff --git a/examples/camelot_quick_start_notebook.ipynb b/examples/camelot_quick_start_notebook.ipynb
Lines changed: 131 additions & 24 deletions
@@ -27,12 +27,89 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
  "metadata": {
  "collapsed": true,
  "id": "9xfD4-FXfgFO"
  },
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Collecting pypdf-table-extraction\n",
+ " Downloading pypdf_table_extraction-0.0.1-py3-none-any.whl.metadata (7.4 kB)\n",
+ "Collecting chardet<6.0.0,>=5.1.0 (from pypdf-table-extraction)\n",
+ " Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)\n",
+ "Collecting click>=8.0.1 (from pypdf-table-extraction)\n",
+ " Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)\n",
+ "Collecting numpy<2.0.0,>=1.24.2 (from pypdf-table-extraction)\n",
+ " Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.0/61.0 kB\u001b[0m \u001b[31m474.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+ "\u001b[?25hCollecting openpyxl<4.0.0,>=3.1.0 (from pypdf-table-extraction)\n",
+ " Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)\n",
+ "Collecting pandas<2.0.0,>=1.5.3 (from pypdf-table-extraction)\n",
+ " Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
+ "Collecting pdfminer-six<20221106,>=20221105 (from pypdf-table-extraction)\n",
+ " Downloading pdfminer.six-20221105-py3-none-any.whl.metadata (4.0 kB)\n",
+ "Collecting pypdf<4.0.0,>=3.4.0 (from pypdf-table-extraction)\n",
+ " Downloading pypdf-3.17.4-py3-none-any.whl.metadata (7.5 kB)\n",
+ "Collecting tabulate<0.10.0,>=0.9.0 (from pypdf-table-extraction)\n",
+ " Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)\n",
+ "Collecting et-xmlfile (from openpyxl<4.0.0,>=3.1.0->pypdf-table-extraction)\n",
+ " Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.1 in /home/codespace/.local/lib/python3.10/site-packages (from pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (2.9.0.post0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /home/codespace/.local/lib/python3.10/site-packages (from pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (2024.1)\n",
+ "Requirement already satisfied: charset-normalizer>=2.0.0 in /home/codespace/.local/lib/python3.10/site-packages (from pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (3.3.2)\n",
+ "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/python/3.10.13/lib/python3.10/site-packages (from pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (43.0.0)\n",
+ "Requirement already satisfied: cffi>=1.12 in /home/codespace/.local/lib/python3.10/site-packages (from cryptography>=36.0.0->pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (1.16.0)\n",
+ "Requirement already satisfied: six>=1.5 in /home/codespace/.local/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas<2.0.0,>=1.5.3->pypdf-table-extraction) (1.16.0)\n",
+ "Requirement already satisfied: pycparser in /home/codespace/.local/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer-six<20221106,>=20221105->pypdf-table-extraction) (2.22)\n",
+ "Downloading pypdf_table_extraction-0.0.1-py3-none-any.whl (40 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.7/40.7 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.4/199.4 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading click-8.1.7-py3-none-any.whl (97 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.9/97.9 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+ "\u001b[?25hDownloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.9/250.9 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m55.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+ "\u001b[?25hDownloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m31.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+ "\u001b[?25hDownloading pypdf-3.17.4-py3-none-any.whl (278 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.2/278.2 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
+ "\u001b[?25hDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)\n",
+ "Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n",
+ "Installing collected packages: tabulate, pypdf, numpy, et-xmlfile, click, chardet, pandas, openpyxl, pdfminer-six, pypdf-table-extraction\n",
+ " Attempting uninstall: numpy\n",
+ " Found existing installation: numpy 2.0.0\n",
+ " Uninstalling numpy-2.0.0:\n",
+ " Successfully uninstalled numpy-2.0.0\n",
+ " Attempting uninstall: pandas\n",
+ " Found existing installation: pandas 2.2.2\n",
+ " Uninstalling pandas-2.2.2:\n",
+ " Successfully uninstalled pandas-2.2.2\n",
+ "Successfully installed chardet-5.2.0 click-8.1.7 et-xmlfile-1.1.0 numpy-1.26.4 openpyxl-3.1.5 pandas-1.5.3 pdfminer-six-20221105 pypdf-3.17.4 pypdf-table-extraction-0.0.1 tabulate-0.9.0\n",
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
+ "Collecting ghostscript\n",
+ " Downloading ghostscript-0.7-py2.py3-none-any.whl.metadata (4.4 kB)\n",
+ "Requirement already satisfied: setuptools>=38.6.0 in /usr/local/python/3.10.13/lib/python3.10/site-packages (from ghostscript) (68.2.2)\n",
+ "Downloading ghostscript-0.7-py2.py3-none-any.whl (25 kB)\n",
+ "Installing collected packages: ghostscript\n",
+ "Successfully installed ghostscript-0.7\n",
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
+ "E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)\n",
+ "E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?\n"
+ ]
+ }
+ ],
  "source": [
  "# @title 🛠️ Install Requirements\n",
  "!pip install pypdf-table-extraction # contains camelot\n",
@@ -43,13 +120,27 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
  "metadata": {
  "cellView": "form",
  "collapsed": true,
  "id": "x2gOsT54QO6f"
  },
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "PermissionError",
+ "evalue": "[Errno 13] Permission denied: '/content'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mPermissionError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[2], line 24\u001b[0m\n\u001b[1;32m 21\u001b[0m delete_directory(sample_data_dir)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# Create the necessary directories\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmakedirs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/content/output\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexist_ok\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 25\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/content/sample_pdfs\u001b[39m\u001b[38;5;124m'\u001b[39m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Define input and output directories\u001b[39;00m\n",
+ "File \u001b[0;32m~/.python/current/lib/python3.10/os.py:215\u001b[0m, in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m head \u001b[38;5;129;01mand\u001b[39;00m tail \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m path\u001b[38;5;241m.\u001b[39mexists(head):\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 215\u001b[0m \u001b[43mmakedirs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhead\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexist_ok\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexist_ok\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileExistsError\u001b[39;00m:\n\u001b[1;32m 217\u001b[0m \u001b[38;5;66;03m# Defeats race condition when another thread created the path\u001b[39;00m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n",
+ "File \u001b[0;32m~/.python/current/lib/python3.10/os.py:225\u001b[0m, in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 225\u001b[0m \u001b[43mmkdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 227\u001b[0m \u001b[38;5;66;03m# Cannot rely on checking for EEXIST, since the operating system\u001b[39;00m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;66;03m# could give priority to other errors like EACCES or EROFS\u001b[39;00m\n\u001b[1;32m 229\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exist_ok \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m path\u001b[38;5;241m.\u001b[39misdir(name):\n",
+ "\u001b[0;31mPermissionError\u001b[0m: [Errno 13] Permission denied: '/content'"
+ ]
+ }
+ ],
  "source": [
  "# @title 📂 Create necessary directories and delete `sample_data` if exists\n",
  "\n",
@@ -198,13 +289,13 @@
  },
  "outputs": [],
  "source": [
- "# @title ⚙️ Core - Simple Tables (Strict Parameters)\n",
- "\n",
  "import ghostscript\n",
  "import camelot\n",
  "import logging\n",
+ "import pandas as pd\n",
  "from pathlib import Path\n",
- "from pypdf import PdfReader # Ensure correct import from the updated library\n",
+ "from pypdf import PdfReader\n",
+ "from IPython.display import display\n",
  "\n",
  "# Set up logging\n",
  "logging.getLogger(\"camelot\").setLevel(logging.DEBUG)\n",
@@ -224,7 +315,7 @@
  " logging.error(f\"Failed to open PDF {pdf_file.name} with PdfReader: {e}\")\n",
  " return\n",
  "\n",
- " # Read tables from the PDF using pypdf_table_extraction\n",
+ " # Read tables from the PDF using camelot\n",
  " try:\n",
  " tables = camelot.read_pdf(str(pdf_file))\n",
  " except Exception as e:\n",
@@ -241,27 +332,29 @@
  " pdf_output_dir = output_dir / pdf_file.stem\n",
  " pdf_output_dir.mkdir(exist_ok=True)\n",
  "\n",
- " # Export all tables to a single CSV file\n",
- " try:\n",
- " tables.export(str(pdf_output_dir / f\"{pdf_file.stem}.csv\"), f='csv', compress=False)\n",
- " except Exception as e:\n",
- " print(f\"Failed to export tables from {pdf_file.name}: {e}\")\n",
- " logging.error(f\"Failed to export tables from {pdf_file.name}: {e}\")\n",
- " return\n",
- "\n",
  " # Process individual tables\n",
  " for i, table in enumerate(tables):\n",
  " try:\n",
+ " # Convert table to pandas DataFrame\n",
+ " df = table.df\n",
+ "\n",
+ " # Display the DataFrame\n",
+ " print(f\"\\nTable {i+1} from {pdf_file.name}:\")\n",
+ " display(df)\n",
+ "\n",
  " # Save individual table to CSV\n",
- " table.to_csv(str(pdf_output_dir / f\"{pdf_file.stem}_table_{i+1}.csv\"))\n",
+ " csv_path = pdf_output_dir / f\"{pdf_file.stem}_table_{i+1}.csv\"\n",
+ " table.to_csv(str(csv_path)) # camelot's writer: no synthetic 0..N header row, all fields quoted\n",
+ " print(f\"Saved to {csv_path}\")\n",
+ "\n",
  " # Log parsing report for each table\n",
- " print(f\"Table {i+1} Parsing Report:\")\n",
+ " print(f\"\\nTable {i+1} Parsing Report:\")\n",
  " logging.info(f\"Table {i+1} Parsing Report:\")\n",
  " print(table.parsing_report)\n",
  " logging.info(table.parsing_report)\n",
  " except Exception as e:\n",
- " print(f\"Failed to save or log table {i+1} from {pdf_file.name}: {e}\")\n",
- " logging.error(f\"Failed to save or log table {i+1} from {pdf_file.name}: {e}\")\n",
+ " print(f\"Failed to process or save table {i+1} from {pdf_file.name}: {e}\")\n",
+ " logging.error(f\"Failed to process or save table {i+1} from {pdf_file.name}: {e}\")\n",
  "\n",
  "# Define input_dir and output_dir\n",
  "input_dir = Path('/content/sample_pdfs')\n",
@@ -298,11 +391,14 @@
  },
  "outputs": [],
  "source": [
- "# @title ⚙️ Core - Complex Tables (Loose Parameters)\n",
+ "# @title ⚙️ Core - Complex Tables (Loose Parameters) with Clean Output\n",
  "\n",
  "import camelot\n",
  "import os\n",
  "from pathlib import Path\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from tabulate import tabulate\n",
  "\n",
  "# Create output directory if it doesn't exist\n",
  "output_dir = Path('/content/output')\n",
@@ -311,7 +407,7 @@
  "# Process all PDF files in the input directory\n",
  "input_dir = Path('/content/sample_pdfs')\n",
  "for pdf_file in input_dir.glob('*.pdf'):\n",
- " print(f\"Processing {pdf_file.name}\")\n",
+ " print(f\"\\nProcessing {pdf_file.name}\")\n",
  "\n",
  " # Using 'stream' flavor with table_areas\n",
  " tables_stream = camelot.read_pdf(str(pdf_file), flavor='stream', table_areas=['50,750,500,50'])\n",
@@ -334,12 +430,23 @@
  " tables.export(f'{output_base}.csv', f='csv', compress=True) # export all tables to CSV\n",
  " tables[0].to_csv(f'{output_base}_first_table.csv') # Save the first table to CSV\n",
  " df = tables[0].df # Get the first table as a pandas DataFrame\n",
+ " \n",
  " print(f\"Tables found in {pdf_file.name}:\")\n",
- " print(df)\n",
+ " \n",
+ " # Clean up the DataFrame\n",
+ " df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x) # Remove leading/trailing whitespace\n",
+ " df = df.replace(['', 'nan', 'NaN', 'NULL'], np.nan).dropna(how='all') # Remove empty rows\n",
+ " df = df.fillna('') # Replace NaN with empty string for display\n",
+ " df = df.reset_index(drop=True) # Reset index after dropping rows\n",
+ " \n",
+ " # Display the clean DataFrame\n",
+ " print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))\n",
+ " \n",
+ " print(f\"\\nShape of the DataFrame: {df.shape}\")\n",
  " else:\n",
  " print(f\"No tables found in {pdf_file.name}\")\n",
  "\n",
- "print(\"Processing complete. Check the output directory for results.\")"
+ "print(\"\\nProcessing complete. Check the output directory for results.\")"
  ]
  },
  {