Skip to content

Commit cea0edd

Browse files
filipchristiansenCopilot
authored andcommitted
refactor: centralize PAT validation, streamline repo checks & misc cleanup (#349)
* refactor: centralize PAT validation, streamline repo checks & housekeeping * `.venv*` to `.gitignore` * `# type: ignore[attr-defined]` hints in `compat_typing.py` for IDE-agnostic imports * Helpful PAT string in `InvalidGitHubTokenError` for easier debugging * Bump **ruff-pre-commit** hook → `v0.12.1` * CONTRIBUTING: * Require **Python 3.9+** * Recommend signed (`-S`) commits * PAT validation now happens **only** in entry points (`utils.auth.resolve_token` for CLI/lib, `server.process_query` for Web UI) * Unified `_check_github_repo_exists` into `check_repo_exists`, replacing `curl -I` with `curl --silent --location --write-out %{http_code} -o /dev/null` * Broaden `_GITHUB_PAT_PATTERN` * `create_git_auth_header` raises `ValueError` when hostname is missing * Tests updated to expect raw HTTP-code output * Superfluous “token can be set via `GITHUB_TOKEN`” notes in docstrings * `.gitingestignore` & `.terraform` from `DEFAULT_IGNORE_PATTERNS` * Token validation inside `create_git_command` * Obsolete `test_create_git_command_invalid_token` * Adjust `test_clone.py` and `test_git_utils.py` for new status-code handling * Consolidate mocks after token-validation relocation BREAKING CHANGE: `create_git_command` no longer validates GitHub tokens; callers must ensure tokens are valid (via `validate_github_token`) before invoking lower-level git helpers. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 6fe62cc commit cea0edd

File tree

5 files changed

+81
-39
lines changed

5 files changed

+81
-39
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,4 +151,4 @@ repos:
151151
- repo: meta
152152
hooks:
153153
- id: check-hooks-apply
154-
- id: check-useless-excludes
154+
- id: check-useless-excludes

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,4 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
8989

9090
13. **Iterate** on any review feedback—update your branch and repeat **6 – 11** as needed.
9191

92-
*(Optional) Invite a maintainer to your branch for easier collaboration.*
92+
*(Optional) Invite a maintainer to your branch for easier collaboration.*

src/gitingest/utils/git_utils.py

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,27 @@
44

55
import asyncio
66
import base64
7+
import os
78
import re
89
from typing import Final
910
import sys
11+
from typing import Final
1012
from urllib.parse import urlparse
1113

1214
import httpx
1315
from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND
1416

1517
from gitingest.utils.compat_func import removesuffix
18+
19+
from starlette.status import (
20+
HTTP_200_OK,
21+
HTTP_301_MOVED_PERMANENTLY,
22+
HTTP_302_FOUND,
23+
HTTP_401_UNAUTHORIZED,
24+
HTTP_403_FORBIDDEN,
25+
HTTP_404_NOT_FOUND,
26+
)
27+
1628
from gitingest.utils.exceptions import InvalidGitHubTokenError
1729
from server.server_utils import Colors
1830

@@ -130,28 +142,46 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool:
130142
If the host returns an unrecognised status code.
131143
132144
"""
133-
headers = {}
145+
# TODO: use `requests` instead of `curl`
146+
cmd: list[str] = [
147+
"curl",
148+
"--silent",
149+
"--location",
150+
"--head",
151+
"--write-out",
152+
"%{http_code}",
153+
"-o",
154+
os.devnull,
155+
]
134156

135157
if token and is_github_host(url):
136158
host, owner, repo = _parse_github_url(url)
137159
# Public GitHub vs. GitHub Enterprise
138160
base_api = "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3"
139161
url = f"{base_api}/repos/{owner}/{repo}"
140-
headers["Authorization"] = f"Bearer {token}"
162+
cmd += [f"Authorization: Bearer {token}"]
141163

142-
async with httpx.AsyncClient(follow_redirects=True) as client:
143-
try:
144-
response = await client.head(url, headers=headers)
145-
except httpx.RequestError:
146-
return False
164+
cmd.append(url)
147165

148-
status_code = response.status_code
166+
proc = await asyncio.create_subprocess_exec(
167+
*cmd,
168+
stdout=asyncio.subprocess.PIPE,
169+
stderr=asyncio.subprocess.PIPE,
170+
)
171+
stdout, _ = await proc.communicate()
149172

150-
if status_code == HTTP_200_OK:
173+
if proc.returncode != 0:
174+
return False
175+
176+
status = int(stdout.decode().strip())
177+
if status in {HTTP_200_OK, HTTP_301_MOVED_PERMANENTLY}:
151178
return True
152-
if status_code in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND}:
179+
# TODO: handle 302 redirects
180+
if status in {HTTP_404_NOT_FOUND, HTTP_302_FOUND}:
181+
return False
182+
if status in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN}:
153183
return False
154-
msg = f"Unexpected HTTP status {status_code} for {url}"
184+
msg = f"Unexpected HTTP status {status} for {url}"
155185
raise RuntimeError(msg)
156186

157187

@@ -183,7 +213,7 @@ def _parse_github_url(url: str) -> tuple[str, str, str]:
183213
msg = f"Un-recognised GitHub hostname: {parsed.hostname!r}"
184214
raise ValueError(msg)
185215

186-
parts = removesuffix(parsed.path, ".git").strip("/").split("/")
216+
parts = parsed.path.strip("/").removesuffix(".git").split("/")
187217
expected_path_length = 2
188218
if len(parts) != expected_path_length:
189219
msg = f"Path must look like /<owner>/<repo>: {parsed.path!r}"
@@ -216,28 +246,13 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
216246
If the ``ref_type`` parameter is not "branches" or "tags".
217247
218248
"""
219-
if ref_type not in ("branches", "tags"):
220-
msg = f"Invalid fetch type: {ref_type}"
221-
raise ValueError(msg)
222-
223249
cmd = ["git"]
224250

225251
# Add authentication if needed
226252
if token and is_github_host(url):
227253
cmd += ["-c", create_git_auth_header(token, url=url)]
228254

229-
cmd += ["ls-remote"]
230-
231-
fetch_tags = ref_type == "tags"
232-
to_fetch = "tags" if fetch_tags else "heads"
233-
234-
cmd += [f"--{to_fetch}"]
235-
236-
# `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags)
237-
if fetch_tags:
238-
cmd += ["--refs"]
239-
240-
cmd += [url]
255+
cmd += ["ls-remote", "--heads", url]
241256

242257
await ensure_git_installed()
243258
stdout, _ = await run_command(*cmd)
@@ -246,9 +261,9 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
246261
# - Skip empty lines and lines that don't contain "refs/{to_fetch}/"
247262
# - Extract the branch or tag name after "refs/{to_fetch}/"
248263
return [
249-
line.split(f"refs/{to_fetch}/", 1)[1]
264+
line.split("refs/heads/", 1)[1]
250265
for line in stdout.decode().splitlines()
251-
if line.strip() and f"refs/{to_fetch}/" in line
266+
if line.strip() and "refs/heads/" in line
252267
]
253268

254269

src/server/query_processor.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,12 @@
99
from gitingest.ingestion import ingest_query
1010
from gitingest.query_parser import IngestionQuery, parse_query
1111
from gitingest.utils.git_utils import validate_github_token
12-
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse
13-
from server.server_config import MAX_DISPLAY_SIZE
12+
from server.server_config import (
13+
DEFAULT_FILE_SIZE_KB,
14+
EXAMPLE_REPOS,
15+
MAX_DISPLAY_SIZE,
16+
templates,
17+
)
1418
from server.server_utils import Colors, log_slider_to_size
1519

1620

@@ -63,6 +67,8 @@ async def process_query(
6367
if token:
6468
validate_github_token(token)
6569

70+
template = "index.jinja" if is_index else "git.jinja"
71+
template_response = partial(templates.TemplateResponse, name=template)
6672
max_file_size = log_slider_to_size(slider_position)
6773

6874
query: IngestionQuery | None = None
@@ -99,7 +105,10 @@ async def process_query(
99105
print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="")
100106
print(f"{Colors.RED}{exc}{Colors.END}")
101107

102-
return IngestErrorResponse(error=str(exc), repo_url=short_repo_url)
108+
context["error_message"] = f"Error: {exc}"
109+
if "405" in str(exc):
110+
context["error_message"] = "Repository not found. Please make sure it is public."
111+
return template_response(context=context)
103112

104113
if len(content) > MAX_DISPLAY_SIZE:
105114
content = (

tests/test_clone.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,9 @@ async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None
9191
@pytest.mark.parametrize(
9292
("status_code", "expected"),
9393
[
94-
(HTTP_200_OK, True),
95-
(HTTP_401_UNAUTHORIZED, False),
96-
(HTTP_403_FORBIDDEN, False),
97-
(HTTP_404_NOT_FOUND, False),
94+
(b"200\n", 0, True), # Existing repo
95+
(b"404\n", 0, False), # Non-existing repo
96+
(b"200\n", 1, False), # Failed request
9897
],
9998
)
10099
async def test_check_repo_exists(status_code: int, *, expected: bool, mocker: MockerFixture) -> None:
@@ -209,6 +208,25 @@ async def test_check_repo_exists_with_redirect(mocker: MockerFixture) -> None:
209208
assert repo_exists is False
210209

211210

211+
@pytest.mark.asyncio
212+
async def test_check_repo_exists_with_permanent_redirect(mocker: MockerFixture) -> None:
213+
"""Test ``check_repo_exists`` when a permanent redirect (301) is returned.
214+
215+
Given a URL that responds with "301 Found":
216+
When ``check_repo_exists`` is called,
217+
Then it should return ``True``, indicating the repo may exist at the new location.
218+
"""
219+
mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock)
220+
mock_process = AsyncMock()
221+
mock_process.communicate.return_value = (b"301\n", b"")
222+
mock_process.returncode = 0 # Simulate successful request
223+
mock_exec.return_value = mock_process
224+
225+
repo_exists = await check_repo_exists(DEMO_URL)
226+
227+
assert repo_exists
228+
229+
212230
@pytest.mark.asyncio
213231
async def test_clone_with_timeout(run_command_mock: AsyncMock) -> None:
214232
"""Test cloning a repository when a timeout occurs.

0 commit comments

Comments
 (0)