Skip to content

Commit 6578c1c

Browse files
committed
Add the ability to filter by max file size
1 parent 16100ac commit 6578c1c

File tree

6 files changed

+79
-124
lines changed

6 files changed

+79
-124
lines changed

src/ingest.py

Lines changed: 40 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,45 @@ def generate_token_string(context_string: str) -> str:
269269
formatted_tokens = f"{total_gpt_tokens}"
270270
return formatted_tokens
271271

272+
273+
def ingest_single_file(path: str, query: dict) -> tuple:
    """Build the digest for a single file (a ``blob`` query).

    Args:
        path: Filesystem path of the file to ingest.
        query: Parsed query. This function reads the keys ``max_file_size``,
            ``local_path``, ``user_name`` and ``repo_name``.

    Returns:
        A ``(summary, tree, files_content)`` tuple of strings: a short
        summary header, a one-entry directory tree, and the formatted
        file content.

    Raises:
        ValueError: If ``path`` is not a regular file, or if
            ``is_text_file`` reports that it is not a text file.
    """
    if not os.path.isfile(path):
        raise ValueError(f"Path {path} is not a file")

    file_size = os.path.getsize(path)
    if not is_text_file(path):
        raise ValueError(f"File {path} is not a text file")

    # Check the size limit *before* reading: an oversized file is replaced by
    # a placeholder, so loading it into memory first would be wasted work.
    # NOTE(review): file_size is in bytes (os.path.getsize); confirm that
    # query['max_file_size'] uses the same unit.
    if file_size > query['max_file_size']:
        content = "[Content ignored: file too large]"
    else:
        content = read_file_content(path)

    file_info = {
        # Strip the local checkout prefix so the path is repository-relative.
        "path": path.replace(query['local_path'], ""),
        "content": content,
        "size": file_size,
    }

    file_name = os.path.basename(path)
    summary = (
        f"Repository: {query['user_name']}/{query['repo_name']}\n"
        f"File: {file_name}\n"
        f"Size: {file_size:,} bytes\n"
        f"Lines: {len(content.splitlines()):,}\n"
    )

    files_content = create_file_content_string([file_info])
    tree = "Directory structure:\n└── " + file_name

    # The token estimate is optional: it is appended only when
    # generate_token_string returns a truthy value.
    formatted_tokens = generate_token_string(files_content)
    if formatted_tokens:
        summary += f"\nEstimated tokens: {formatted_tokens}"
    return (summary, tree, files_content)
309+
310+
272311
def ingest_from_query(query: dict, ignore_patterns: List[str] = DEFAULT_IGNORE_PATTERNS) -> Dict:
273312
"""Main entry point for analyzing a codebase directory or single file."""
274313

@@ -277,43 +316,7 @@ def ingest_from_query(query: dict, ignore_patterns: List[str] = DEFAULT_IGNORE_P
277316
raise ValueError(f"{query['slug']} cannot be found, make sure the repository is public")
278317

279318
if query.get('type') == 'blob':
280-
if not os.path.isfile(path):
281-
raise ValueError(f"Path {path} is not a file")
282-
283-
file_size = os.path.getsize(path)
284-
is_text = is_text_file(path)
285-
if not is_text:
286-
raise ValueError(f"File {path} is not a text file")
287-
288-
content = read_file_content(path)
289-
if file_size > query['max_file_size']:
290-
content = "[Content ignored: file too large]"
291-
292-
file_info = {
293-
"path": path.replace(query['local_path'], ""),
294-
"content": content,
295-
"size": file_size
296-
}
297-
298-
summary = (
299-
f"Repository: {query['user_name']}/{query['repo_name']}\n"
300-
f"File: {os.path.basename(path)}\n"
301-
f"Size: {file_size:,} bytes\n"
302-
f"Lines: {len(content.splitlines()):,}\n"
303-
)
304-
305-
306-
307-
files_content = create_file_content_string([file_info])
308-
tree = "Directory structure:\n└── " + os.path.basename(path)
309-
310-
311-
print(files_content)
312-
formatted_tokens = generate_token_string(files_content)
313-
if formatted_tokens:
314-
summary += f"\nEstimated tokens: {formatted_tokens}"
315-
return (summary, tree, files_content)
316-
319+
return ingest_single_file(path, query)
317320
else:
318321
nodes = scan_directory(path, ignore_patterns, query['local_path'])
319322
files = extract_files_content(query, nodes, query['max_file_size'])

src/routers/dynamic.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from process_query import process_query
77
from config import MAX_DISPLAY_SIZE
88
from utils.limiter import limiter
9-
9+
from utils.log_convert import logSliderToSize
1010

1111
router = APIRouter()
1212
templates = Jinja2Templates(directory="templates")
@@ -21,15 +21,27 @@ async def catch_all(request: Request, full_path: str):
2121
"result": False,
2222
"loading": True,
2323
"github_url": f"https://github.com/{full_path}",
24+
"default_file_size": 243
2425
}
2526
)
2627

2728
@router.post("/{full_path:path}", response_class=HTMLResponse)
2829
@limiter.limit("10/minute")
29-
async def process_catch_all(request: Request, input_text: str = Form(...)):
30+
async def process_catch_all(
31+
request: Request,
32+
input_text: str = Form(...),
33+
max_file_size: int = Form(...)
34+
):
35+
36+
slider_position = max_file_size
37+
size_in_kb = logSliderToSize(max_file_size)
3038
try:
31-
parsed_url = parse_url(input_text)
32-
summary, tree, content = await process_query(parsed_url)
39+
parsed_query = parse_url(input_text, size_in_kb)
40+
41+
summary, tree, content = await process_query(
42+
parsed_query,
43+
44+
)
3345
except Exception as e:
3446
print(e)
3547
return templates.TemplateResponse(
@@ -38,11 +50,13 @@ async def process_catch_all(request: Request, input_text: str = Form(...)):
3850
"request": request,
3951
"result": False,
4052
"loading": False,
41-
"error_message": f"Error: \n {e}"
53+
"github_url": input_text,
54+
"error_message": f"Error: \n {e}",
55+
"default_file_size": slider_position,
56+
4257
}
4358
)
4459

45-
4660
if len(content) > MAX_DISPLAY_SIZE:
4761
content = f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, download full digest to see more)\n" + content[:MAX_DISPLAY_SIZE]
4862

@@ -55,6 +69,8 @@ async def process_catch_all(request: Request, input_text: str = Form(...)):
5569
"summary": summary,
5670
"tree": tree,
5771
"content": content,
58-
"ingest_id": parsed_url["id"]
72+
"ingest_id": parsed_query["id"],
73+
"github_url": input_text,
74+
"default_file_size": max_file_size
5975
}
6076
)

src/routers/index.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,19 @@
11
from fastapi import APIRouter, Request, Form
22
from fastapi.responses import HTMLResponse, JSONResponse
33
from fastapi.templating import Jinja2Templates
4-
import math
54

65
from utils.parse_url import parse_url
76
from utils.limiter import limiter
87
from process_query import process_query
98
from config import MAX_DISPLAY_SIZE, EXAMPLE_REPOS
9+
from utils.log_convert import logSliderToSize
1010

1111

1212
router = APIRouter()
1313
templates = Jinja2Templates(directory="templates")
1414

1515

16-
def logSliderToSize(position):
17-
"""Convert slider position to file size in KB"""
18-
minp = 0
19-
maxp = 500
20-
minv = math.log(1)
21-
maxv = math.log(102400)
22-
23-
scale = (maxv - minv) / (maxp - minp)
24-
return round(math.exp(minv + scale * (position - minp)))
16+
2517

2618

2719
@router.get("/", response_class=HTMLResponse)
@@ -31,7 +23,7 @@ async def home(request: Request):
3123
{
3224
"request": request,
3325
"examples": EXAMPLE_REPOS,
34-
"default_file_size": 250
26+
"default_file_size": 243
3527
}
3628
)
3729

@@ -43,12 +35,8 @@ async def index_post(
4335
input_text: str = Form(...),
4436
max_file_size: int = Form(...)
4537
):
46-
# Store original slider position (0-500)
4738
slider_position = max_file_size
48-
49-
# Convert to KB for processing
5039
size_in_kb = logSliderToSize(max_file_size)
51-
print(f"Processing repository with max file size: {size_in_kb}kb")
5240

5341
try:
5442
parsed_query = parse_url(input_text, size_in_kb)
@@ -69,7 +57,6 @@ async def index_post(
6957
if len(content) > MAX_DISPLAY_SIZE:
7058
content = f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE]
7159

72-
print(f"Slider position: {(max_file_size)}")
7360
return templates.TemplateResponse(
7461
"index.jinja.html",
7562
{

src/static/js/utils.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,7 @@ function logSliderToSize(position) {
129129
const minv = Math.log(1);
130130
const maxv = Math.log(102400);
131131

132-
const scale = (maxv - minv) / (maxp - minp);
133-
const value = Math.exp(minv + scale * (position - minp));
132+
const value = Math.exp(minv + (maxv - minv) * Math.pow(position / maxp, 1.5));
134133
return Math.round(value);
135134
}
136135

src/templates/components/github_form.jinja.html

Lines changed: 3 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@
3131
max="500"
3232
value="{{ default_file_size }}"
3333
class="w-full h-3
34-
bg-[#FE4A60] <!-- This defines the left (filled) part color -->
34+
bg-[#FE4A60]
3535
bg-no-repeat
3636
bg-[length:50%_100%]
37-
bg-[#FFF4DA] <!-- This defines the right (unfilled) part color -->
37+
bg-[#FFF4DA]
3838
appearance-none
3939
border-[3px]
4040
border-gray-900
@@ -70,63 +70,4 @@
7070
</div>
7171
{% endif %}
7272
</div>
73-
</div>
74-
75-
<script>
76-
document.addEventListener('DOMContentLoaded', function() {
77-
const slider = document.getElementById('file_size');
78-
const sizeValue = document.getElementById('size_value');
79-
const form = document.getElementById('ingestForm');
80-
81-
// Convert linear slider value (0-500) to logarithmic size (1KB to 100MB)
82-
function logSliderToSize(position) {
83-
const minp = 0;
84-
const maxp = 500;
85-
const minv = Math.log(1);
86-
const maxv = Math.log(102400);
87-
88-
const scale = (maxv - minv) / (maxp - minp);
89-
const value = Math.exp(minv + scale * (position - minp));
90-
return Math.round(value);
91-
}
92-
93-
function formatSize(sizeInKB) {
94-
if (sizeInKB >= 1024) {
95-
return Math.round(sizeInKB / 1024) + 'mb';
96-
}
97-
return Math.round(sizeInKB) + 'kb';
98-
}
99-
100-
function updateSlider() {
101-
const value = logSliderToSize(slider.value);
102-
sizeValue.textContent = formatSize(value);
103-
// Update the background size
104-
slider.style.backgroundSize = `${(slider.value / slider.max) * 100}% 100%`;
105-
}
106-
107-
// Initialize slider
108-
if (slider) {
109-
updateSlider(); // This will now update both the label and background on load
110-
}
111-
112-
// Update on slider change
113-
slider.addEventListener('input', updateSlider);
114-
115-
// Add form submit handler to convert size before sending
116-
form.addEventListener('submit', function(e) {
117-
// Get the actual KB value from the slider
118-
const kbValue = logSliderToSize(slider.value);
119-
120-
// Create a hidden input to send the actual KB value
121-
const hiddenInput = document.createElement('input');
122-
hiddenInput.type = 'hidden';
123-
hiddenInput.name = 'max_file_size';
124-
hiddenInput.value = kbValue;
125-
126-
// Replace the original slider input in the form submission
127-
form.appendChild(hiddenInput);
128-
});
129-
});
130-
131-
132-
</script>
73+
</div>

src/utils/log_convert.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import math
2+
3+
def logSliderToSize(position):
    """Convert a slider position to a file size in KB.

    The slider range 0..500 is mapped onto 1..102400 KB on a logarithmic
    scale, with the normalised position raised to the power 1.5.

    Args:
        position: Slider position. Values outside 0..500 are clamped to the
            nearest end of the range.

    Returns:
        The corresponding size in KB as an int, between 1 and 102400.
    """
    maxp = 500
    minv = math.log(1)
    maxv = math.log(102400)

    # Clamp to the slider range. The position comes from a submitted form
    # field, so it can be anything: a negative value would make the
    # fractional power complex (math.exp then raises TypeError), and a very
    # large one would overflow math.exp.
    fraction = min(max(position, 0), maxp) / maxp
    return round(math.exp(minv + (maxv - minv) * pow(fraction, 1.5)))

0 commit comments

Comments
 (0)