Skip to content

Commit cfac641

Browse files
committed
add examples for chonkie
1 parent 1d02207 commit cfac641

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

github_code_indexing/main.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,30 @@
1111
from numpy.typing import NDArray
1212
import numpy as np
1313

14+
import cocoindex.functions.chonkie as coco_chonkie
15+
1416

1517
@cocoindex.op.function()
def extract_extension(filename: str) -> str:
    """Return the extension of *filename*, including the leading dot (or "" if none)."""
    _root, ext = os.path.splitext(filename)
    return ext
1921

2022

23+
@cocoindex.op.function()
24+
def extract_language(extension: str) -> str | None:
25+
"""Extract the extension of a filename."""
26+
match extension:
27+
case ".py":
28+
return "python"
29+
case ".md" | ".mdx":
30+
return "markdown"
31+
case ".toml":
32+
return "toml"
33+
case ".rs":
34+
return "rust"
35+
return None
36+
37+
2138
@cocoindex.transform_flow()
2239
def code_to_embedding(
2340
text: cocoindex.DataSlice[str],
@@ -77,13 +94,42 @@ def github_code_indexing_flow(
7794

7895
with data_scope["files"].row() as file:
7996
file["extension"] = file["filename"].transform(extract_extension)
97+
98+
# Use SplitRecursively
8099
file["chunks"] = file["content"].transform(
81100
cocoindex.functions.SplitRecursively(),
82101
language=file["extension"],
83102
chunk_size=1000,
84103
min_chunk_size=300,
85104
chunk_overlap=300,
86105
)
106+
107+
# Use ChonkieRecursiveChunker
108+
# file["chunks"] = file["content"].transform(
109+
# coco_chonkie.ChonkieRecursiveChunker(
110+
# chunk_size=1000,
111+
# )
112+
# )
113+
114+
# Use ChonkieCodeChunker
115+
# file["language"] = file["extension"].transform(extract_language)
116+
# file["chunks"] = file["content"].transform(
117+
# coco_chonkie.ChonkieCodeChunker(chunk_size=1000),
118+
# language=file["language"],
119+
# )
120+
121+
# Use ChonkieSemanticChunker
122+
# file["chunks"] = file["content"].transform(
123+
# coco_chonkie.ChonkieSemanticChunker(
124+
# chunk_size=1000,
125+
# )
126+
# )
127+
128+
# Use ChonkieNeuralChunker
129+
# file["chunks"] = file["content"].transform(
130+
# coco_chonkie.ChonkieNeuralChunker(device_map="mps"),
131+
# )
132+
87133
with file["chunks"].row() as chunk:
88134
chunk["embedding"] = chunk["text"].call(code_to_embedding)
89135
code_embeddings.collect(

0 commit comments

Comments
 (0)