Skip to content

Commit d2825ea

Browse files
feat: add support for improved handling of jupyter notebooks (#105)
1 parent 551d09a commit d2825ea

File tree

8 files changed

+321
-9
lines changed

8 files changed

+321
-9
lines changed

src/gitingest/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from gitingest.clone import clone_repo
44
from gitingest.ingest import ingest
5-
from gitingest.ingest_from_query import ingest_from_query
5+
from gitingest.ingest_from_query import run_ingest_query
66
from gitingest.parse_query import parse_query
77

8-
__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"]
8+
__all__ = ["run_ingest_query", "clone_repo", "parse_query", "ingest"]

src/gitingest/ingest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from config import TMP_BASE_PATH
88
from gitingest.clone import CloneConfig, clone_repo
9-
from gitingest.ingest_from_query import ingest_from_query
9+
from gitingest.ingest_from_query import run_ingest_query
1010
from gitingest.parse_query import parse_query
1111

1212

@@ -75,7 +75,7 @@ def ingest(
7575
else:
7676
raise TypeError("clone_repo did not return a coroutine as expected.")
7777

78-
summary, tree, content = ingest_from_query(query)
78+
summary, tree, content = run_ingest_query(query)
7979

8080
if output is not None:
8181
with open(output, "w", encoding="utf-8") as f:

src/gitingest/ingest_from_query.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import tiktoken
88

99
from gitingest.exceptions import AlreadyVisitedError, MaxFileSizeReachedError, MaxFilesReachedError
10+
from gitingest.notebook_utils import process_notebook
1011

1112
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB
1213
MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal
@@ -158,7 +159,10 @@ def _read_file_content(file_path: Path) -> str:
158159
The content of the file, or an error message if the file could not be read.
159160
"""
160161
try:
161-
with file_path.open(encoding="utf-8", errors="ignore") as f:
162+
if file_path.suffix == ".ipynb":
163+
return process_notebook(file_path)
164+
165+
with open(file_path, encoding="utf-8", errors="ignore") as f:
162166
return f.read()
163167
except OSError as e:
164168
return f"Error reading file: {e}"
@@ -819,7 +823,7 @@ def _ingest_directory(path: Path, query: dict[str, Any]) -> tuple[str, str, str]
819823
return summary, tree, files_content
820824

821825

822-
def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]:
826+
def run_ingest_query(query: dict[str, Any]) -> tuple[str, str, str]:
823827
"""
824828
Main entry point for analyzing a codebase directory or single file.
825829

src/gitingest/notebook_utils.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
""" Utilities for processing Jupyter notebooks. """
2+
3+
import json
4+
import warnings
5+
from pathlib import Path
6+
from typing import Any
7+
8+
9+
def process_notebook(file: Path) -> str:
    """
    Process a Jupyter notebook file and return an executable Python script as a string.

    Code cells are emitted verbatim. Markdown, raw and legacy (nbformat v3) heading cells are
    wrapped in triple-quoted string literals so the result stays valid Python. Cells without any
    source text are skipped, and the remaining cells are separated by one blank line.

    Parameters
    ----------
    file : Path
        The path to the Jupyter notebook file.

    Returns
    -------
    str
        The executable Python script as a string. Empty when the notebook contains no cells.

    Raises
    ------
    ValueError
        If the file does not contain valid JSON (``json.JSONDecodeError`` is a ``ValueError``),
        if the top-level JSON value is not an object, or if an unexpected cell type is encountered.
    """
    with file.open(encoding="utf-8") as f:
        notebook: Any = json.load(f)

    if not isinstance(notebook, dict):
        raise ValueError(f"Invalid notebook: expected a JSON object, got {type(notebook).__name__}")

    # Check if the notebook contains worksheets
    if worksheets := notebook.get("worksheets"):
        # https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets
        # "The `worksheets` field is a list, but we have no UI to support multiple worksheets.
        # Our design has since shifted to heading-cell based structure, so we never intend to
        # support the multiple worksheet model. The worksheets list of lists shall be replaced
        # with a single list, called `cells`."
        # stacklevel=2 attributes the warning to the caller rather than to this module.
        warnings.warn("Worksheets are deprecated as of IPEP-17.", DeprecationWarning, stacklevel=2)

        if len(worksheets) > 1:
            warnings.warn(
                "Multiple worksheets are not supported. Only the first worksheet will be processed.",
                UserWarning,
                stacklevel=2,
            )

        notebook = worksheets[0]

    result = []

    # A notebook (or worksheet) without a `cells` entry is treated as empty instead of raising KeyError.
    for cell in notebook.get("cells") or []:
        cell_type = cell.get("cell_type")

        # Validate cell type and handle unexpected types ("heading" only exists in nbformat v3)
        if cell_type not in ("markdown", "code", "raw", "heading"):
            raise ValueError(f"Unknown cell type: {cell_type}")

        source = cell.get("source")
        if source is None and cell_type == "code":
            # nbformat v3 (the worksheet-based format) stores the code of a cell under `input`.
            source = cell.get("input")

        # `source` may be a single string or a list of line strings.
        str_ = source if isinstance(source, str) else "".join(source or [])
        if not str_:
            continue

        if cell_type == "heading":
            # Render a legacy heading cell the way nbformat upgrades it: as a one-line Markdown heading.
            str_ = "#" * int(cell.get("level", 1)) + " " + " ".join(str_.splitlines())

        # Convert Markdown, raw and heading cells to multi-line comments
        if cell_type != "code":
            # Escape embedded triple quotes so they cannot terminate the string literal early.
            str_ = str_.replace('"""', '\\"\\"\\"')
            str_ = f'"""\n{str_}\n"""'

        result.append(str_)

    return "\n\n".join(result)

src/process_query.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE
1010
from gitingest.clone import CloneConfig, clone_repo
11-
from gitingest.ingest_from_query import ingest_from_query
11+
from gitingest.ingest_from_query import run_ingest_query
1212
from gitingest.parse_query import parse_query
1313
from server_utils import Colors, log_slider_to_size
1414

@@ -91,7 +91,7 @@ async def process_query(
9191
branch=query.get("branch"),
9292
)
9393
await clone_repo(clone_config)
94-
summary, tree, content = ingest_from_query(query)
94+
summary, tree, content = run_ingest_query(query)
9595
with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f:
9696
f.write(tree + "\n" + content)
9797
except Exception as e:

tests/conftest.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
""" This module contains fixtures for the tests. """
22

3+
import json
34
from pathlib import Path
45
from typing import Any
56

@@ -72,3 +73,18 @@ def temp_directory(tmp_path: Path) -> Path:
7273
(dir2 / "file_dir2.txt").write_text("Hello from dir2")
7374

7475
return test_dir
76+
77+
78+
@pytest.fixture
def write_notebook(tmp_path: Path):
    """
    Provide a factory that serializes a dict as JSON into a notebook file under ``tmp_path``.

    The returned callable takes the file name and the notebook content, writes the file with
    UTF-8 encoding and returns the path of the written file.
    """

    def _dump_notebook(name: str, content: dict[str, Any]) -> Path:
        target = tmp_path / name
        target.write_text(json.dumps(content), encoding="utf-8")
        return target

    return _dump_notebook

tests/test_ingest.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
from pathlib import Path
44
from typing import Any
5+
from unittest.mock import patch
56

6-
from gitingest.ingest_from_query import _extract_files_content, _scan_directory
7+
from gitingest.ingest_from_query import _extract_files_content, _read_file_content, _scan_directory
78

89

910
def test_scan_directory(temp_directory: Path, sample_query: dict[str, Any]) -> None:
@@ -37,6 +38,25 @@ def test_extract_files_content(temp_directory: Path, sample_query: dict[str, Any
3738
assert any("file_dir2.txt" in p for p in paths)
3839

3940

41+
def test_read_file_content_with_notebook(tmp_path: Path):
    """Check that `_read_file_content` hands a `.ipynb` path over to `process_notebook`."""
    ipynb_file = tmp_path / "dummy_notebook.ipynb"
    # The content is irrelevant here because the notebook processor is replaced by a mock.
    ipynb_file.write_text("{}", encoding="utf-8")

    # The patch targets the name bound inside ingest_from_query, which is the one the reader calls.
    with patch("gitingest.ingest_from_query.process_notebook") as notebook_mock:
        _read_file_content(ipynb_file)

    notebook_mock.assert_called_once_with(ipynb_file)
49+
50+
51+
def test_read_file_content_with_non_notebook(tmp_path: Path):
    """Check that `_read_file_content` does not call `process_notebook` for a non-notebook file."""
    script_file = tmp_path / "dummy_file.py"
    script_file.write_text("print('Hello')", encoding="utf-8")

    # The patch targets the name bound inside ingest_from_query, which is the one the reader would call.
    with patch("gitingest.ingest_from_query.process_notebook") as notebook_mock:
        _read_file_content(script_file)

    notebook_mock.assert_not_called()
58+
59+
4060
# TODO: test with include patterns: ['*.txt']
4161
# TODO: test with wrong include patterns: ['*.qwerty']
4262

0 commit comments

Comments
 (0)