Skip to content

Commit de19278

Browse files
Merge pull request #223 from cyclotruc/refactor/simplify-codebase
refactor: refactor ingestion output, remove unused exception and simplify cloning & FileSystem logic
2 parents ee8a351 + 2c593bf commit de19278

File tree

5 files changed

+152
-198
lines changed

5 files changed

+152
-198
lines changed

src/gitingest/cloning.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,12 @@ async def clone_repo(config: CloneConfig) -> None:
100100
checkout_cmd = ["git", "-C", local_path]
101101

102102
if partial_clone:
103+
subpath = config.subpath.lstrip("/")
103104
if config.blob:
104-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name
105-
checkout_cmd += ["sparse-checkout", "set", Path(config.subpath.lstrip("/")).parent]
106-
else:
107-
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
105+
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
106+
subpath = str(Path(subpath).parent.as_posix())
107+
108+
checkout_cmd += ["sparse-checkout", "set", subpath]
108109

109110
if commit:
110111
checkout_cmd += ["checkout", commit]

src/gitingest/exceptions.py

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,27 +30,6 @@ class AsyncTimeoutError(Exception):
3030
"""
3131

3232

33-
class MaxFilesReachedError(Exception):
34-
"""Exception raised when the maximum number of files is reached."""
35-
36-
def __init__(self, max_files: int) -> None:
37-
super().__init__(f"Maximum number of files ({max_files}) reached.")
38-
39-
40-
class MaxFileSizeReachedError(Exception):
41-
"""Exception raised when the maximum file size is reached."""
42-
43-
def __init__(self, max_size: int):
44-
super().__init__(f"Maximum file size limit ({max_size/1024/1024:.1f}MB) reached.")
45-
46-
47-
class AlreadyVisitedError(Exception):
48-
"""Exception raised when a symlink target has already been visited."""
49-
50-
def __init__(self, path: str) -> None:
51-
super().__init__(f"Symlink target already visited: {path}")
52-
53-
5433
class InvalidNotebookError(Exception):
5534
"""Exception raised when a Jupyter notebook is invalid or cannot be processed."""
5635

src/gitingest/filesystem_schema.py

Lines changed: 61 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,11 @@
77
from enum import Enum, auto
88
from pathlib import Path
99

10-
from gitingest.exceptions import InvalidNotebookError
1110
from gitingest.utils.ingestion_utils import _get_encoding_list
1211
from gitingest.utils.notebook_utils import process_notebook
1312
from gitingest.utils.textfile_checker_utils import is_textfile
1413

15-
SEPARATOR = "=" * 48 + "\n"
14+
SEPARATOR = "=" * 48 # Tiktoken, the tokenizer OpenAI uses, counts the separator as 2 tokens if it is longer than 48 characters
1615

1716

1817
class FileSystemNodeType(Enum):
@@ -36,108 +35,105 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
3635
"""
3736
Class representing a node in the file system (either a file or directory).
3837
39-
This class has more than the recommended number of attributes because it needs to
40-
track various properties of files and directories for comprehensive analysis.
38+
Tracks properties of files/directories for comprehensive analysis.
4139
"""
4240

4341
name: str
44-
type: FileSystemNodeType # e.g., "directory" or "file"
42+
type: FileSystemNodeType
4543
path_str: str
4644
path: Path
4745
size: int = 0
4846
file_count: int = 0
4947
dir_count: int = 0
5048
depth: int = 0
51-
children: list[FileSystemNode] = field(default_factory=list) # Using default_factory instead of empty list
49+
children: list[FileSystemNode] = field(default_factory=list)
5250

5351
def sort_children(self) -> None:
5452
"""
5553
Sort the children nodes of a directory according to a specific order.
5654
5755
Order of sorting:
58-
1. README.md first
59-
2. Regular files (not starting with dot)
60-
3. Hidden files (starting with dot)
61-
4. Regular directories (not starting with dot)
62-
5. Hidden directories (starting with dot)
63-
All groups are sorted alphanumerically within themselves.
64-
"""
65-
# Separate files and directories
66-
files = [child for child in self.children if child.type == FileSystemNodeType.FILE]
67-
directories = [child for child in self.children if child.type == FileSystemNodeType.DIRECTORY]
56+
1. README.md first
2. Regular files (not starting with dot)
57+
3. Hidden files (starting with dot)
58+
4. Regular directories (not starting with dot)
59+
5. Hidden directories (starting with dot)
6860
69-
# Find README.md
70-
readme_files = [f for f in files if f.name.lower() == "readme.md"]
71-
other_files = [f for f in files if f.name.lower() != "readme.md"]
61+
All groups are sorted alphanumerically within themselves.
7262
73-
# Separate hidden and regular files/directories
74-
regular_files = [f for f in other_files if not f.name.startswith(".")]
75-
hidden_files = [f for f in other_files if f.name.startswith(".")]
76-
regular_dirs = [d for d in directories if not d.name.startswith(".")]
77-
hidden_dirs = [d for d in directories if d.name.startswith(".")]
63+
Raises
64+
------
65+
ValueError
66+
If the node is not a directory.
67+
"""
68+
if self.type != FileSystemNodeType.DIRECTORY:
69+
raise ValueError("Cannot sort children of a non-directory node")
7870

79-
# Sort each group alphanumerically
80-
regular_files.sort(key=lambda x: x.name)
81-
hidden_files.sort(key=lambda x: x.name)
82-
regular_dirs.sort(key=lambda x: x.name)
83-
hidden_dirs.sort(key=lambda x: x.name)
71+
def _sort_key(child: FileSystemNode) -> tuple[int, str]:
72+
# Return the sort priority for a child; lower values sort first.
73+
# Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir
74+
name = child.name.lower()
75+
if child.type == FileSystemNodeType.FILE:
76+
if name == "readme.md":
77+
return (0, name)
78+
return (1 if not name.startswith(".") else 2, name)
79+
return (3 if not name.startswith(".") else 4, name)
8480

85-
self.children = readme_files + regular_files + hidden_files + regular_dirs + hidden_dirs
81+
self.children.sort(key=_sort_key)
8682

8783
@property
8884
def content_string(self) -> str:
8985
"""
90-
Return the content of the node as a string.
91-
92-
This property returns the content of the node as a string, including the path and content.
86+
Return the content of the node as a string, including path and content.
9387
9488
Returns
9589
-------
9690
str
9791
A string representation of the node's content.
9892
"""
99-
content_repr = SEPARATOR
93+
parts = [
94+
SEPARATOR,
95+
f"File: {str(self.path_str).replace(os.sep, '/')}",
96+
SEPARATOR,
97+
f"{self.content}",
98+
]
10099

101-
# Use forward slashes in output paths
102-
content_repr += f"File: {str(self.path_str).replace(os.sep, '/')}\n"
103-
content_repr += SEPARATOR
104-
content_repr += f"{self.content}\n\n"
105-
return content_repr
100+
return "\n".join(parts) + "\n\n"
106101

107102
@property
108103
def content(self) -> str: # pylint: disable=too-many-return-statements
109104
"""
110-
Read the content of a file.
111-
112-
This function attempts to open a file and read its contents using UTF-8 encoding.
113-
If an error occurs during reading (e.g., file is not found or permission error),
114-
it returns an error message.
105+
Read the content of a file if it's text (or a notebook). Return an error message otherwise.
115106
116107
Returns
117108
-------
118109
str
119110
The content of the file, or an error message if the file could not be read.
111+
112+
Raises
113+
------
114+
ValueError
115+
If the node is a directory.
120116
"""
121-
if self.type == FileSystemNodeType.FILE and not is_textfile(self.path):
117+
if self.type == FileSystemNodeType.DIRECTORY:
118+
raise ValueError("Cannot read content of a directory node")
119+
120+
if not is_textfile(self.path):
122121
return "[Non-text file]"
123122

124-
try:
125-
if self.path.suffix == ".ipynb":
126-
try:
127-
return process_notebook(self.path)
128-
except Exception as exc:
129-
return f"Error processing notebook: {exc}"
130-
131-
for encoding in _get_encoding_list():
132-
try:
133-
with self.path.open(encoding=encoding) as f:
134-
return f.read()
135-
except UnicodeDecodeError:
136-
continue
137-
except OSError as exc:
138-
return f"Error reading file: {exc}"
139-
140-
return "Error: Unable to decode file with available encodings"
141-
142-
except (OSError, InvalidNotebookError) as exc:
143-
return f"Error reading file: {exc}"
123+
if self.path.suffix == ".ipynb":
124+
try:
125+
return process_notebook(self.path)
126+
except Exception as exc:
127+
return f"Error processing notebook: {exc}"
128+
129+
# Try multiple encodings
130+
for encoding in _get_encoding_list():
131+
try:
132+
with self.path.open(encoding=encoding) as f:
133+
return f.read()
134+
except UnicodeDecodeError:
135+
continue
136+
except OSError as exc:
137+
return f"Error reading file: {exc}"
138+
139+
return "Error: Unable to decode file with available encodings"

src/gitingest/ingestion.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES
88
from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats
9-
from gitingest.output_formatters import format_directory, format_single_file
9+
from gitingest.output_formatters import format_node
1010
from gitingest.query_parsing import ParsedQuery
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
1212
from gitingest.utils.path_utils import _is_safe_symlink
@@ -38,7 +38,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
3838
Raises
3939
------
4040
ValueError
41-
If the specified path cannot be found or if the file is not a text file.
41+
If the path cannot be found, is not a file, or the file has no content.
4242
"""
4343
subpath = Path(query.subpath.strip("/")).as_posix()
4444
path = query.local_path / subpath
@@ -63,7 +63,11 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
6363
path_str=str(relative_path),
6464
path=path,
6565
)
66-
return format_single_file(file_node, query)
66+
67+
if not file_node.content:
68+
raise ValueError(f"File {file_node.name} has no content")
69+
70+
return format_node(file_node, query)
6771

6872
root_node = FileSystemNode(
6973
name=path.name,
@@ -80,7 +84,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
8084
stats=stats,
8185
)
8286

83-
return format_directory(root_node, query)
87+
return format_node(root_node, query)
8488

8589

8690
def apply_gitingest_file(path: Path, query: ParsedQuery) -> None:

0 commit comments

Comments
 (0)