|
11 | 11 | from numpy.typing import NDArray
|
12 | 12 | import numpy as np
|
13 | 13 |
|
| 14 | +import cocoindex.functions.chonkie as coco_chonkie |
| 15 | + |
14 | 16 |
|
@cocoindex.op.function()
def extract_extension(filename: str) -> str:
    """Return the extension of *filename*, including the leading dot (may be empty)."""
    _root, ext = os.path.splitext(filename)
    return ext
|
19 | 21 |
|
20 | 22 |
|
| 23 | +@cocoindex.op.function() |
| 24 | +def extract_language(extension: str) -> str | None: |
| 25 | + """Extract the extension of a filename.""" |
| 26 | + match extension: |
| 27 | + case ".py": |
| 28 | + return "python" |
| 29 | + case ".md" | ".mdx": |
| 30 | + return "markdown" |
| 31 | + case ".toml": |
| 32 | + return "toml" |
| 33 | + case ".rs": |
| 34 | + return "rust" |
| 35 | + return None |
| 36 | + |
| 37 | + |
21 | 38 | @cocoindex.transform_flow()
|
22 | 39 | def code_to_embedding(
|
23 | 40 | text: cocoindex.DataSlice[str],
|
@@ -77,13 +94,42 @@ def github_code_indexing_flow(
|
77 | 94 |
|
78 | 95 | with data_scope["files"].row() as file:
|
79 | 96 | file["extension"] = file["filename"].transform(extract_extension)
|
| 97 | + |
| 98 | + # Use SplitRecursively |
80 | 99 | file["chunks"] = file["content"].transform(
|
81 | 100 | cocoindex.functions.SplitRecursively(),
|
82 | 101 | language=file["extension"],
|
83 | 102 | chunk_size=1000,
|
84 | 103 | min_chunk_size=300,
|
85 | 104 | chunk_overlap=300,
|
86 | 105 | )
|
| 106 | + |
| 107 | + # Use ChonkieRecursiveChunker |
| 108 | + # file["chunks"] = file["content"].transform( |
| 109 | + # coco_chonkie.ChonkieRecursiveChunker( |
| 110 | + # chunk_size=1000, |
| 111 | + # ) |
| 112 | + # ) |
| 113 | + |
| 114 | + # Use ChonkieCodeChunker |
| 115 | + # file["language"] = file["extension"].transform(extract_language) |
| 116 | + # file["chunks"] = file["content"].transform( |
| 117 | + # coco_chonkie.ChonkieCodeChunker(chunk_size=1000), |
| 118 | + # language=file["language"], |
| 119 | + # ) |
| 120 | + |
| 121 | + # Use ChonkieSemanticChunker |
| 122 | + # file["chunks"] = file["content"].transform( |
| 123 | + # coco_chonkie.ChonkieSemanticChunker( |
| 124 | + # chunk_size=1000, |
| 125 | + # ) |
| 126 | + # ) |
| 127 | + |
| 128 | + # Use ChonkieNeuralChunker |
| 129 | + # file["chunks"] = file["content"].transform( |
| 130 | + # coco_chonkie.ChonkieNeuralChunker(device_map="mps"), |
| 131 | + # ) |
| 132 | + |
87 | 133 | with file["chunks"].row() as chunk:
|
88 | 134 | chunk["embedding"] = chunk["text"].call(code_to_embedding)
|
89 | 135 | code_embeddings.collect(
|
|
0 commit comments