22Module for fetching the HTML node
33"""
44
5- from typing import List
6- from langchain_community .document_loaders import AsyncHtmlLoader
5+ from typing import List , Optional
6+ from langchain_community .document_loaders import AsyncChromiumLoader
77from langchain_core .documents import Document
88from .base_node import BaseNode
99from ..utils .remover import remover
@@ -37,7 +37,7 @@ class FetchNode(BaseNode):
3737 to succeed.
3838 """
3939
40- def __init__ (self , input : str , output : List [str ], node_name : str = "Fetch" ):
40+ def __init__ (self , input : str , output : List [str ], node_config : Optional [ dict ], node_name : str = "Fetch" ):
4141 """
4242 Initializes the FetchHTMLNode with a node name and node type.
4343 Arguments:
@@ -46,6 +46,8 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
4646 """
4747 super ().__init__ (node_name , "node" , input , output , 1 )
4848
49+ self .headless = True if node_config is None else node_config .get ("headless" , True )
50+
4951 def execute (self , state ):
5052 """
5153 Executes the node's logic to fetch HTML content from a specified URL and
@@ -79,14 +81,21 @@ def execute(self, state):
7981
8082 else :
8183 if self .node_config is not None and self .node_config .get ("endpoint" ) is not None :
82- loader = AsyncHtmlLoader (
83- source , proxies = {"http" : self .node_config ["endpoint" ]})
84+
85+ loader = AsyncChromiumLoader (
86+ [source ],
87+ proxies = {"http" : self .node_config ["endpoint" ]},
88+ headless = self .headless ,
89+ )
8490 else :
85- loader = AsyncHtmlLoader (source )
91+ loader = AsyncChromiumLoader (
92+ [source ],
93+ headless = self .headless ,
94+ )
8695
8796 document = loader .load ()
8897 compressed_document = [
89- Document (page_content = remover (str (document )))]
98+ Document (page_content = remover (str (document [ 0 ]. page_content )))]
9099
91100 state .update ({self .output [0 ]: compressed_document })
92101 return state
0 commit comments