4
4
from dotenv import load_dotenv
5
5
from psycopg_pool import ConnectionPool
6
6
from pgvector .psycopg import register_vector
7
- from typing import Any
8
7
import functools
9
8
import cocoindex
10
9
import os
14
13
import cocoindex .functions .chonkie as coco_chonkie
15
14
16
15
17
@cocoindex.op.function()
def extract_extension(filename: str) -> str:
    """Return the extension of *filename*, including its leading dot.

    Delegates to ``os.path.splitext``; an extensionless name yields "".
    """
    _, extension = os.path.splitext(filename)
    return extension
21
-
22
-
23
- @cocoindex .op .function ()
24
- def extract_language (extension : str ) -> str | None :
25
- """Extract the extension of a filename."""
26
- match extension :
27
- case ".py" :
28
- return "python"
29
- case ".md" | ".mdx" :
30
- return "markdown"
31
- case ".toml" :
32
- return "toml"
33
- case ".rs" :
34
- return "rust"
35
- return None
36
-
37
-
38
16
@cocoindex .transform_flow ()
39
17
def code_to_embedding (
40
18
text : cocoindex .DataSlice [str ],
@@ -93,16 +71,18 @@ def github_code_indexing_flow(
93
71
code_embeddings = data_scope .add_collector ()
94
72
95
73
with data_scope ["files" ].row () as file :
96
- file ["extension" ] = file ["filename" ].transform (extract_extension )
74
+ file ["language" ] = file ["filename" ].transform (
75
+ cocoindex .functions .DetectProgrammingLanguage ()
76
+ )
97
77
98
78
# Use SplitRecursively
99
- file ["chunks" ] = file ["content" ].transform (
100
- cocoindex .functions .SplitRecursively (),
101
- language = file ["extension " ],
102
- chunk_size = 1000 ,
103
- min_chunk_size = 300 ,
104
- chunk_overlap = 300 ,
105
- )
79
+ # file["chunks"] = file["content"].transform(
80
+ # cocoindex.functions.SplitRecursively(),
81
+ # language=file["language "],
82
+ # chunk_size=1000,
83
+ # min_chunk_size=300,
84
+ # chunk_overlap=300,
85
+ # )
106
86
107
87
# Use ChonkieRecursiveChunker
108
88
# file["chunks"] = file["content"].transform(
@@ -112,11 +92,10 @@ def github_code_indexing_flow(
112
92
# )
113
93
114
94
# Use ChonkieCodeChunker
115
- # file["language"] = file["extension"].transform(extract_language)
116
- # file["chunks"] = file["content"].transform(
117
- # coco_chonkie.ChonkieCodeChunker(chunk_size=1000),
118
- # language=file["language"],
119
- # )
95
+ file ["chunks" ] = file ["content" ].transform (
96
+ coco_chonkie .ChonkieCodeChunker (chunk_size = 1000 ),
97
+ language = file ["language" ],
98
+ )
120
99
121
100
# Use ChonkieSemanticChunker
122
101
# file["chunks"] = file["content"].transform(
@@ -143,7 +122,11 @@ def github_code_indexing_flow(
143
122
144
123
code_embeddings .export (
145
124
"code_embeddings" ,
146
- cocoindex .targets .Postgres (),
125
+ cocoindex .targets .Postgres (
126
+ column_options = {
127
+ "embedding" : cocoindex .targets .PostgresColumnOptions (type = "halfvec" ),
128
+ }
129
+ ),
147
130
primary_key_fields = ["filename" , "location" ],
148
131
vector_indexes = [
149
132
cocoindex .VectorIndexDef (
0 commit comments