udan-jayanith · udan-jayanith · Aug 15, 2025 · Aug 15, 2025 · Aug 15, 2025 · Aug 15, 2025
@@ -1,12 +1,2 @@
-## v0.0.0-beta.3 <- current
-
-## v0.0.1
-* GetElementByClassName
-* GetElementByTagName
-* GetElementById
-* NodeList
-* GetElementsByClassName
-* GetElementsByTagName
-* GetElementsById
-* QuerySelector
-* QuerySelectorAll
+## v0.0.3
+- Closest
@@ -1,6 +1,6 @@
 # GoHTML
 
-A powerful and comprehensive HTML parser and DOM manipulation library for Go, bringing JavaScript-like DOM operations to the Go ecosystem.
+An HTML parser and serializer for Go. GoHTML tries to keep its semantics similar to the JS DOM API while keeping the API simple by not forcing the JS DOM model into GoHTML. Because of this, GoHTML has a node tree model. The GoHTML tokenizer uses the golang.org/x/net/html package for tokenizing in the underlying layer. Therefore, it is the user's responsibility to make sure inputs to GoHTML are UTF-8 encoded. GoHTML allows direct access to the node tree.
 
 ## Installation
 
@@ -50,12 +50,10 @@ Heres an example of fetching a website and parsing and then using querying metho
 ## Changelog
 
 Changes, bug fixes and new features in this version.
-
-- add: NodeList
-- add: Querying helper functions
-- add: ClassList
-- bug fix: Empty attribute value parsing bug fixed
-- changed: Renamed GetTraverser to NewTraverser
+- add: Tokenizer
+- add: NodeTreeBuilder
+- renamed: QuerySelector to Query
+- renamed: QuerySelectorAll to QueryAll
 
 ## Documentation
 

@@ -20,7 +20,7 @@ func TestFetchPostCovers(t *testing.T){
 t.Fatal(err)
 }
 
-nodeList := node.GetElementsByClassName("post-title")
+nodeList := node.QueryAll(".sm-feat .clearfix article")
 t.Log("Got ", nodeList.Len(), " post titles.")
 iter := nodeList.IterNodeList()
 for node := range iter{

@@ -1,6 +1,5 @@
 /*
-A powerful and comprehensive HTML parser and DOM manipulation library for Go,
-bringing JavaScript-like DOM operations to the Go ecosystem.
+An HTML parser and serializer for Go. GoHTML tries to keep its semantics similar to the JS DOM API while keeping the API simple by not forcing the JS DOM model into GoHTML. Because of this, GoHTML has a node tree model. The GoHTML tokenizer uses the golang.org/x/net/html package for tokenizing in the underlying layer. Therefore, it is the user's responsibility to make sure inputs to GoHTML are UTF-8 encoded. GoHTML allows direct access to the node tree.
 */
 package GoHtml
 

@@ -3,69 +3,24 @@ package GoHtml
 import (
 "io"
 "strings"
-
-"github.com/emirpasic/gods/stacks/linkedliststack"
 "golang.org/x/net/html"
 )
 
-// Decode reads from rd and create a node-tree. Then returns the root node and an error. If error were to occur it would be SyntaxError.
-func Decode(r io.Reader) (*Node, error) {
-rootNode := CreateTextNode("")
-stack := linkedliststack.New()
-currentNode := rootNode
 
-z := html.NewTokenizer(r)
+// Decode reads HTML from r and builds a node tree. It returns the root node of that tree; the returned error is always nil.
+func Decode(r io.Reader) (*Node, error) {
+t := NewTokenizer(r)
+nodeTreeBuilder := NewNodeTreeBuilder()
 for {
-tt := z.Next()
-if tt == html.ErrorToken {
+tt := t.Advanced()
+if tt == html.ErrorToken{
 break
 }
 
-currentToken := z.Token()
-if strings.TrimSpace(currentToken.Data) == "" {
-continue
-}
-
-// token data depend on the token type.
-switch currentToken.Type {
-case html.EndTagToken:
-val, ok := stack.Pop()
-if !ok || val == nil {
-continue
-}
-currentNode = val.(*Node)
-case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken:
-var node *Node
-switch currentToken.Type {
-case html.TextToken:
-node = CreateTextNode(currentToken.Data)
-case html.DoctypeToken:
-node = CreateNode(DOCTYPEDTD)
-node.SetAttribute(currentToken.Data, "")
-default:
-node = CreateNode(currentToken.Data)
-for _, v := range currentToken.Attr {
-node.SetAttribute(v.Key, v.Val)
-}
-}
-
-if isTopNode(currentNode, stack){
-currentNode.AppendChild(node)
-}else{
-currentNode.Append(node)
-}
-
-if !node.IsTextNode() && !IsVoidTag(node.GetTagName()){
-stack.Push(node)
-}
-currentNode = node
-}
+nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt)
 }
+return nodeTreeBuilder.GetRootNode(), nil
 
-node := rootNode.GetNextNode()
-rootNode.RemoveNode()
-
-return node, nil
 }
 
 // HTMLToNodeTree returns the given HTML code as a node tree. If an error occurs, it is a SyntaxError.
@@ -75,12 +30,4 @@ func HTMLToNodeTree(html string) (*Node, error) {
 return node, err
 }
 
-func isTopNode(node *Node, stack *linkedliststack.Stack) bool {
-val, ok := stack.Peek()
-if !ok || val == nil {
-return false
-}
 
-topNode := val.(*Node)
-return topNode == node
-}
@@ -172,8 +172,8 @@ func matchQueryTokens(node *Node, queryTokens []QueryToken) bool {
 return true
 }
 
-// QuerySelector returns the first node that matches with the give node.
-func (node *Node) QuerySelector(query string) *Node {
+// Query returns the first node that matches the given query.
+func (node *Node) Query(query string) *Node {
 queryTokens := TokenizeQuery(query)
 
 traverser := NewTraverser(node)
@@ -188,8 +188,8 @@ func (node *Node) QuerySelector(query string) *Node {
 return res
 }
 
-// QuerySelectorAll returns a NodeList containing nodes that matched with the given query.
-func (node *Node) QuerySelectorAll(query string) NodeList{
+// QueryAll returns a NodeList containing the nodes that match the given query.
+func (node *Node) QueryAll(query string) NodeList{
 nodeList := NewNodeList()
 queryTokens := TokenizeQuery(query)
 traverser := NewTraverser(node)

@@ -172,7 +172,7 @@ func TestQuerySelector(t *testing.T) {
 t.Fatal(err)
 return
 }
-imgEl := node.QuerySelector("img #idElement")
+imgEl := node.Query("img #idElement")
 imgSrc, _ := imgEl.GetAttribute("src")
 imgAlt, _ := imgEl.GetAttribute("alt")
 if imgSrc != "" || imgAlt != "" {
@@ -187,7 +187,7 @@ func TestQuerySelectorAll(t *testing.T) {
 return
 }
 
-nodeList := node.QuerySelectorAll("h2")
+nodeList := node.QueryAll("h2")
 if nodeList.Len() != 2{
 t.Fatal("")
 }

@@ -3,10 +3,11 @@ package GoHtml_test
 import (
 "strings"
 "testing"
+"os"
 GoHtml "github.com/udan-jayanith/GoHTML"
 )
 
-func TestEncode(t *testing.T) {
+func TestEncode1(t *testing.T) {
 body := GoHtml.CreateNode("body")
 h1 := GoHtml.CreateNode("h1")
 h1.AppendText("This is a heading")
@@ -20,4 +21,15 @@ func TestEncode(t *testing.T) {
 builder1 := &strings.Builder{}
 GoHtml.Encode(builder1, body)
 // It's hard to compare against the exact expected output because of how strings and prettier format HTML code. htmlFormatter and prettier add extra stuff to the HTML code, like the dash in void tags. The expected output is in ./test-files/2.html.
+}
+
+func TestEncode2(t *testing.T) {
+file, err := os.Open("./test-files/1.html")
+if err != nil {
+t.Fatal("1.html does not exists.")
+}
+node, _ := GoHtml.Decode(file)
+var builder strings.Builder
+GoHtml.Encode(&builder, node)
+// It's hard to compare against the exact expected output because of how strings and prettier format HTML code. htmlFormatter and prettier add extra stuff to the HTML code, like the dash in void tags. The expected output is in ./test-files/2.html.
 }
@@ -0,0 +1,123 @@
+package GoHtml
+
+import (
+"io"
+"strings"
+
+"github.com/emirpasic/gods/stacks/linkedliststack"
+"golang.org/x/net/html"
+)
+
+
+// Tokenizer contains a *html.Tokenizer.
+type Tokenizer struct {
+z *html.Tokenizer
+}
+
+// NewTokenizer returns a new Tokenizer.
+func NewTokenizer(r io.Reader) Tokenizer {
+return Tokenizer{
+z: html.NewTokenizer(r),
+}
+}
+
+// Advanced scans the next token and returns its type.
+func (t *Tokenizer) Advanced() html.TokenType {
+return t.z.Next()
+}
+
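+// CurrentNode returns a node built from the current token.
+// It returns nil when the token data is only whitespace, or when the token is not a doctype, start tag, self-closing tag or text token.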
+func (t *Tokenizer) CurrentNode() *Node {
+currentToken := t.z.Token()
+if strings.TrimSpace(currentToken.Data) == "" {
+return nil
+}
+
+// token data depend on the token type.
+switch currentToken.Type {
+case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken:
+var node *Node
+switch currentToken.Type {
+case html.TextToken:
+node = CreateTextNode(currentToken.Data)
+case html.DoctypeToken:
+node = CreateNode(DOCTYPEDTD)
+node.SetAttribute(currentToken.Data, "")
+default:
+node = CreateNode(currentToken.Data)
+for _, v := range currentToken.Attr {
+node.SetAttribute(v.Key, v.Val)
+}
+}
+return node
+}
+return nil
+}
+
+// NodeTreeBuilder is used to build a node tree from nodes and their token types.
+type NodeTreeBuilder struct {
+rootNode *Node
+stack *linkedliststack.Stack
+currentNode *Node
+}
+
+// NewNodeTreeBuilder returns a new NodeTreeBuilder.
+func NewNodeTreeBuilder() NodeTreeBuilder {
+rootNode := CreateTextNode("")
+return NodeTreeBuilder{
+rootNode: rootNode,
+currentNode: rootNode,
+stack: linkedliststack.New(),
+}
+}
+
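+// WriteNodeTree appends node to the tree according to the token type tt. An end tag token pops the builder's stack instead of appending, and a nil node is ignored.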
+func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType) {
+switch tt {
+case html.EndTagToken:
+val, ok := ntb.stack.Pop()
+if !ok || val == nil {
+return
+}
+ntb.currentNode = val.(*Node)
+case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken:
+if node == nil {
+return
+}
+
+if isTopNode(ntb.currentNode, ntb.stack) {
+ntb.currentNode.AppendChild(node)
+} else {
+ntb.currentNode.Append(node)
+}
+
+if !node.IsTextNode() && !IsVoidTag(node.GetTagName()) {
+ntb.stack.Push(node)
+}
+ntb.currentNode = node
+}
+}
+
+// GetRootNode returns the root node of the accumulated node tree and resets the NodeTreeBuilder.
+func (ntb *NodeTreeBuilder) GetRootNode() *Node {
+node := ntb.rootNode.GetNextNode()
+ntb.rootNode.RemoveNode()
+
+rootNode := CreateTextNode("")
+ntb.rootNode = rootNode
+ntb.currentNode = rootNode
+ntb.stack = linkedliststack.New()
+
+return node
+}
+
+func isTopNode(node *Node, stack *linkedliststack.Stack) bool {
+val, ok := stack.Peek()
+if !ok || val == nil {
+return false
+}
+
+topNode := val.(*Node)
+return topNode == node
+}