
Commit d00cde6

fix(pdf_scraper): fix the pdf scraper graph
1 parent 00a392b commit d00cde6

File tree

2 files changed: +25 −39 lines

scrapegraphai/graphs/abstract_graph.py
scrapegraphai/graphs/pdf_scraper_graph.py


scrapegraphai/graphs/abstract_graph.py

Lines changed: 19 additions & 13 deletions
@@ -181,6 +181,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
                 try:
                     self.model_token = models_tokens["ollama"][llm_params["model"]]
                 except KeyError as exc:
+                    print("model not found, using default token size (8192)")
                     self.model_token = 8192
             else:
                 self.model_token = 8192
@@ -191,25 +192,28 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
         elif "hugging_face" in llm_params["model"]:
             try:
                 self.model_token = models_tokens["hugging_face"][llm_params["model"]]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
+            except KeyError:
+                print("model not found, using default token size (8192)")
+                self.model_token = 8192
             return HuggingFace(llm_params)
         elif "groq" in llm_params["model"]:
             llm_params["model"] = llm_params["model"].split("/")[-1]
 
             try:
                 self.model_token = models_tokens["groq"][llm_params["model"]]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
+            except KeyError:
+                print("model not found, using default token size (8192)")
+                self.model_token = 8192
             return Groq(llm_params)
         elif "bedrock" in llm_params["model"]:
             llm_params["model"] = llm_params["model"].split("/")[-1]
             model_id = llm_params["model"]
             client = llm_params.get('client', None)
             try:
                 self.model_token = models_tokens["bedrock"][llm_params["model"]]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
+            except KeyError:
+                print("model not found, using default token size (8192)")
+                self.model_token = 8192
             return Bedrock({
                 "client": client,
                 "model_id": model_id,
@@ -218,13 +222,18 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
                 }
             })
         elif "claude-3-" in llm_params["model"]:
-            self.model_token = models_tokens["claude"]["claude3"]
+            try:
+                self.model_token = models_tokens["claude"]["claude3"]
+            except KeyError:
+                print("model not found, using default token size (8192)")
+                self.model_token = 8192
             return Anthropic(llm_params)
         elif "deepseek" in llm_params["model"]:
             try:
                 self.model_token = models_tokens["deepseek"][llm_params["model"]]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
+            except KeyError:
+                print("model not found, using default token size (8192)")
+                self.model_token = 8192
             return DeepSeek(llm_params)
         else:
             raise ValueError(
@@ -312,10 +321,7 @@ def _create_embedder(self, embedder_config: dict) -> object:
                 models_tokens["bedrock"][embedder_config["model"]]
             except KeyError as exc:
                 raise KeyError("Model not supported") from exc
-            return BedrockEmbeddings(client=client, model_id=embedder_config["model"])
-        else:
-            raise ValueError(
-                "Model provided by the configuration not supported")
+        return BedrockEmbeddings(client=client, model_id=embedder_config["model"])
 
     def get_state(self, key=None) -> dict:
         """

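Every provider branch of _create_llm now follows the same lookup-with-fallback pattern: try the nested models_tokens table and, on a miss, print a warning and default to 8192 instead of raising KeyError. The standalone sketch below illustrates that pattern outside the library; resolve_model_token is a hypothetical helper and the table entries are illustrative values, not copied from models_tokens.

    # Minimal sketch of the fallback pattern this commit applies to each
    # provider branch: look the model up in a nested
    # {provider: {model: max_tokens}} table, fall back to a default on a miss.
    DEFAULT_TOKEN_SIZE = 8192

    # Illustrative stand-in for the library's models_tokens dict.
    models_tokens = {
        "ollama": {"llama3": 8192},
        "groq": {"llama3-70b-8192": 8192},
    }

    def resolve_model_token(provider: str, model: str) -> int:
        """Return the known context size for provider/model, else the default."""
        try:
            return models_tokens[provider][model]
        except KeyError:
            print("model not found, using default token size (8192)")
            return DEFAULT_TOKEN_SIZE

    assert resolve_model_token("ollama", "llama3") == 8192
    assert resolve_model_token("groq", "unknown-model") == 8192  # falls back

The trade-off is deliberate: an unknown model no longer aborts graph construction, at the cost of silently assuming an 8192-token context window that may be wrong for the actual model.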
scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 6 additions & 26 deletions
@@ -11,7 +11,7 @@
     FetchNode,
     ParseNode,
     RAGNode,
-    GenerateAnswerNode
+    GenerateAnswerPDFNode
 )
 
 
@@ -48,7 +48,7 @@ class PDFScraperGraph(AbstractGraph):
     """
 
     def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
-        super().__init__(prompt, config, source, schema)
+        super().__init__(prompt, config, source)
 
         self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir"
 
@@ -64,41 +64,21 @@ def _create_graph(self) -> BaseGraph:
             input='pdf | pdf_dir',
             output=["doc", "link_urls", "img_urls"],
         )
-        parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
-            node_config={
-                "chunk_size": self.model_token,
-            }
-        )
-        rag_node = RAGNode(
-            input="user_prompt & (parsed_doc | doc)",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model,
-            }
-        )
-        generate_answer_node = GenerateAnswerNode(
+        generate_answer_node_pdf = GenerateAnswerPDFNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
             output=["answer"],
             node_config={
                 "llm_model": self.llm_model,
-                "schema": self.schema,
             }
         )
 
         return BaseGraph(
             nodes=[
                 fetch_node,
-                parse_node,
-                rag_node,
-                generate_answer_node,
+                generate_answer_node_pdf,
             ],
             edges=[
-                (fetch_node, parse_node),
-                (parse_node, rag_node),
-                (rag_node, generate_answer_node)
+                (fetch_node, generate_answer_node_pdf)
             ],
             entry_point=fetch_node
         )
@@ -114,4 +94,4 @@ def run(self) -> str:
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("answer", "No answer found.")
+        return self.final_state.get("answer", "No answer found.")
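With ParseNode and RAGNode dropped, the pipeline is a straight FetchNode → GenerateAnswerPDFNode chain: the fetched document reaches the answer node directly through the parsed_doc | doc fallback in its input expression. Below is a hypothetical caller sketch; it assumes PDFScraperGraph is re-exported from scrapegraphai.graphs and guesses a typical config shape, neither of which is shown in this commit.

    # Hypothetical usage sketch of the simplified PDFScraperGraph.
    # Import path and config keys are assumptions, not taken from this commit.
    from scrapegraphai.graphs import PDFScraperGraph

    pdf_scraper = PDFScraperGraph(
        prompt="Summarize the key findings of this paper.",
        source="example.pdf",  # a ".pdf" suffix selects input_key "pdf"
        config={"llm": {"model": "ollama/llama3"}},  # assumed config shape
    )

    # run() executes fetch -> generate_answer and returns the "answer" state
    # key, or "No answer found." when that key is missing.
    print(pdf_scraper.run())

Note that the constructor still accepts a schema argument but no longer forwards it to AbstractGraph, and the answer node's input still names relevant_chunks even though no RAG node produces it; both only work because of the | fallbacks.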
