3 changes: 3 additions & 0 deletions .gitignore
@@ -107,3 +107,6 @@ airflow/airflow.db

# scraped folders
docs.ray.io/

# book and other source folders
data/
24 changes: 21 additions & 3 deletions app/index.py
@@ -87,7 +87,7 @@ def path_to_uri(path, scheme="https://", domain="docs.ray.io"):
return scheme + domain + path.split(domain)[-1]


def parse_file(record):
def parse_html_file(record):
html_content = load_html_file(record["path"])
if not html_content:
return []
@@ -100,6 +100,17 @@ def parse_file(record):
]


def parse_text_file(record):
with open(record["path"]) as f:
text = f.read()
return [
{
"source": str(record["path"]),
"text": text,
}
]


class EmbedChunks:
def __init__(self, model_name):
self.embedding_model = HuggingFaceEmbeddings(
@@ -139,6 +150,7 @@ def __call__(self, batch):
@app.command()
def create_index(
docs_path: Annotated[str, typer.Option(help="location of data")] = DOCS_PATH,
extension_type: Annotated[str, typer.Option(help="type of data")] = "html",
embedding_model: Annotated[str, typer.Option(help="embedder")] = EMBEDDING_MODEL,
chunk_size: Annotated[int, typer.Option(help="chunk size")] = CHUNK_SIZE,
chunk_overlap: Annotated[int, typer.Option(help="chunk overlap")] = CHUNK_OVERLAP,
@@ -148,11 +160,17 @@ def create_index(

# Dataset
ds = ray.data.from_items(
[{"path": path} for path in Path(docs_path).rglob("*.html") if not path.is_dir()]
[
{"path": path}
for path in Path(docs_path).rglob(f"*.{extension_type}")
if not path.is_dir()
]
)

# Sections
sections_ds = ds.flat_map(parse_file)
parser = parse_html_file if extension_type == "html" else parse_text_file
sections_ds = ds.flat_map(parser)
# TODO: do we really need to take_all()? Bring the splitter to the cluster
sections = sections_ds.take_all()

# Chunking
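Taken together, the app/index.py changes let the same pipeline index plain-text sources alongside the scraped HTML docs: create_index gains an extension_type option, globs for that extension, and routes records through parse_text_file rather than parse_html_file whenever the extension is not "html". A minimal usage sketch follows; the data/ paths and the .txt extension are assumptions for illustration, not part of this PR.

# Sketch only: assumes a local data/ folder of .txt files (hypothetical paths).
from app.index import create_index, parse_text_file

# parse_text_file returns a single section per file: the raw text plus its source path.
sections = parse_text_file({"path": "data/book/chapter_01.txt"})  # hypothetical file
print(sections[0]["source"], len(sections[0]["text"]))

# Full pipeline: glob *.txt under data/, parse, chunk, embed, and write the index.
create_index(docs_path="data/", extension_type="txt")

The hunk above is cut off before the chunking step, so the existing splitter code is not shown here. One way the TODO about take_all() could be addressed, assuming chunking currently runs LangChain's RecursiveCharacterTextSplitter on the driver, is to map the splitter over the Dataset so the work stays on the cluster. The following is a sketch of that idea, not this PR's implementation.

from functools import partial

import ray
from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_section(section, chunk_size, chunk_overlap):
    # Split one section into chunks, carrying the source path along as metadata.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    docs = splitter.create_documents(
        texts=[section["text"]], metadatas=[{"source": section["source"]}]
    )
    return [{"text": d.page_content, "source": d.metadata["source"]} for d in docs]


# Toy stand-in for the sections Dataset built in create_index; in the real
# pipeline this flat_map would replace the sections_ds.take_all() call.
sections_ds = ray.data.from_items(
    [{"source": "example.txt", "text": "hello world\n\nsecond paragraph"}]
)
chunks_ds = sections_ds.flat_map(
    partial(chunk_section, chunk_size=16, chunk_overlap=0)
)
print(chunks_ds.take(2))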
4 changes: 2 additions & 2 deletions dashboard/pages/1_✨_Generation.py
@@ -9,7 +9,7 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pgvector.psycopg import register_vector

from app.index import parse_file
from app.index import parse_html_file
from app.query import generate_response


@@ -38,7 +38,7 @@ def get_ds(docs_path):
docs_page_url = st.text_input("Docs page URL", "https://docs.ray.io/en/master/train/faq.html")
docs_page_path = docs_path_str + docs_page_url.split("docs.ray.io/en/master/")[-1]
with st.expander("View sections"):
sections = parse_file({"path": docs_page_path})
sections = parse_html_file({"path": docs_page_path})
st.write(sections)

# Chunks