Skip to content

Commit 68b2562

Browse files
committed
add examples for DetectProgrammingLanguage and halfvec
1 parent cfac641 commit 68b2562

File tree

1 file changed

+19
-36
lines changed

1 file changed

+19
-36
lines changed

github_code_indexing/main.py

Lines changed: 19 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
from dotenv import load_dotenv
55
from psycopg_pool import ConnectionPool
66
from pgvector.psycopg import register_vector
7-
from typing import Any
87
import functools
98
import cocoindex
109
import os
@@ -14,27 +13,6 @@
1413
import cocoindex.functions.chonkie as coco_chonkie
1514

1615

17-
@cocoindex.op.function()
18-
def extract_extension(filename: str) -> str:
19-
"""Extract the extension of a filename."""
20-
return os.path.splitext(filename)[1]
21-
22-
23-
@cocoindex.op.function()
24-
def extract_language(extension: str) -> str | None:
25-
"""Extract the extension of a filename."""
26-
match extension:
27-
case ".py":
28-
return "python"
29-
case ".md" | ".mdx":
30-
return "markdown"
31-
case ".toml":
32-
return "toml"
33-
case ".rs":
34-
return "rust"
35-
return None
36-
37-
3816
@cocoindex.transform_flow()
3917
def code_to_embedding(
4018
text: cocoindex.DataSlice[str],
@@ -93,16 +71,18 @@ def github_code_indexing_flow(
9371
code_embeddings = data_scope.add_collector()
9472

9573
with data_scope["files"].row() as file:
96-
file["extension"] = file["filename"].transform(extract_extension)
74+
file["language"] = file["filename"].transform(
75+
cocoindex.functions.DetectProgrammingLanguage()
76+
)
9777

9878
# Use SplitRecursively
99-
file["chunks"] = file["content"].transform(
100-
cocoindex.functions.SplitRecursively(),
101-
language=file["extension"],
102-
chunk_size=1000,
103-
min_chunk_size=300,
104-
chunk_overlap=300,
105-
)
79+
# file["chunks"] = file["content"].transform(
80+
# cocoindex.functions.SplitRecursively(),
81+
# language=file["language"],
82+
# chunk_size=1000,
83+
# min_chunk_size=300,
84+
# chunk_overlap=300,
85+
# )
10686

10787
# Use ChonkieRecursiveChunker
10888
# file["chunks"] = file["content"].transform(
@@ -112,11 +92,10 @@ def github_code_indexing_flow(
11292
# )
11393

11494
# Use ChonkieCodeChunker
115-
# file["language"] = file["extension"].transform(extract_language)
116-
# file["chunks"] = file["content"].transform(
117-
# coco_chonkie.ChonkieCodeChunker(chunk_size=1000),
118-
# language=file["language"],
119-
# )
95+
file["chunks"] = file["content"].transform(
96+
coco_chonkie.ChonkieCodeChunker(chunk_size=1000),
97+
language=file["language"],
98+
)
12099

121100
# Use ChonkieSemanticChunker
122101
# file["chunks"] = file["content"].transform(
@@ -143,7 +122,11 @@ def github_code_indexing_flow(
143122

144123
code_embeddings.export(
145124
"code_embeddings",
146-
cocoindex.targets.Postgres(),
125+
cocoindex.targets.Postgres(
126+
column_options={
127+
"embedding": cocoindex.targets.PostgresColumnOptions(type="halfvec"),
128+
}
129+
),
147130
primary_key_fields=["filename", "location"],
148131
vector_indexes=[
149132
cocoindex.VectorIndexDef(

0 commit comments

Comments
 (0)