Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions FUTURE-CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,2 @@
## v0.0.0-beta.3 <- current

## v0.0.1
* GetElementByClassName
* GetElementByTagName
* GetElementById
* NodeList
* GetElementsByClassName
* GetElementsByTagName
* GetElementsById
* QuerySelector
* QuerySelectorAll
## v0.0.3
- Closest
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# GoHTML

A powerful and comprehensive HTML parser and DOM manipulation library for Go, bringing JavaScript-like DOM operations to the Go ecosystem.
An HTML parser and serializer for Go. GoHTML tries to keep its semantics similar to the JS DOM API while keeping the API simple by not forcing the JS DOM model onto GoHTML. Because of this, GoHTML uses a node tree model. The GoHTML tokenizer uses the golang.org/x/net/html package for tokenizing in the underlying layer. Therefore, it is the user's responsibility to make sure that input to GoHTML is UTF-8 encoded. GoHTML allows direct access to the node tree.

## Installation

Expand Down Expand Up @@ -50,12 +50,10 @@ Heres an example of fetching a website and parsing and then using querying metho
## Changelog

Changes, bug fixes and new features in this version.

- add: NodeList
- add: Querying helper functions
- add: ClassList
- bug fix: Empty attribute value parsing bug fixed
- changed: Renamed GetTraverser to NewTraverser
- add: Tokenizer
- add: NodeTreeBuilder
- renamed: QuerySelector to Query
- renamed: QuerySelectorAll to QueryAll

## Documentation

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func TestFetchPostCovers(t *testing.T){
t.Fatal(err)
}

nodeList := node.GetElementsByClassName("post-title")
nodeList := node.QueryAll(".sm-feat .clearfix article")
t.Log("Got ", nodeList.Len(), " post titles.")
iter := nodeList.IterNodeList()
for node := range iter{
Expand Down
3 changes: 1 addition & 2 deletions main.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
/*
A powerful and comprehensive HTML parser and DOM manipulation library for Go,
bringing JavaScript-like DOM operations to the Go ecosystem.
An HTML parser and serializer for Go. GoHTML tries to keep its semantics similar to the JS DOM API while keeping the API simple by not forcing the JS DOM model onto GoHTML. Because of this, GoHTML uses a node tree model. The GoHTML tokenizer uses the golang.org/x/net/html package for tokenizing in the underlying layer. Therefore, it is the user's responsibility to make sure that input to GoHTML is UTF-8 encoded. GoHTML allows direct access to the node tree.
*/
package GoHtml

Expand Down
69 changes: 8 additions & 61 deletions parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,69 +3,24 @@ package GoHtml
import (
"io"
"strings"

"github.com/emirpasic/gods/stacks/linkedliststack"
"golang.org/x/net/html"
)

// Decode reads from rd and create a node-tree. Then returns the root node and an error. If error were to occur it would be SyntaxError.
func Decode(r io.Reader) (*Node, error) {
rootNode := CreateTextNode("")
stack := linkedliststack.New()
currentNode := rootNode

z := html.NewTokenizer(r)
// Decode reads from r and builds a node tree. It returns the root node and a nil error.
func Decode(r io.Reader) (*Node, error) {
t := NewTokenizer(r)
nodeTreeBuilder := NewNodeTreeBuilder()
for {
tt := z.Next()
if tt == html.ErrorToken {
tt := t.Advanced()
if tt == html.ErrorToken{
break
}

currentToken := z.Token()
if strings.TrimSpace(currentToken.Data) == "" {
continue
}

// token data depend on the token type.
switch currentToken.Type {
case html.EndTagToken:
val, ok := stack.Pop()
if !ok || val == nil {
continue
}
currentNode = val.(*Node)
case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken:
var node *Node
switch currentToken.Type {
case html.TextToken:
node = CreateTextNode(currentToken.Data)
case html.DoctypeToken:
node = CreateNode(DOCTYPEDTD)
node.SetAttribute(currentToken.Data, "")
default:
node = CreateNode(currentToken.Data)
for _, v := range currentToken.Attr {
node.SetAttribute(v.Key, v.Val)
}
}

if isTopNode(currentNode, stack){
currentNode.AppendChild(node)
}else{
currentNode.Append(node)
}

if !node.IsTextNode() && !IsVoidTag(node.GetTagName()){
stack.Push(node)
}
currentNode = node
}
nodeTreeBuilder.WriteNodeTree(t.CurrentNode(), tt)
}
return nodeTreeBuilder.GetRootNode(), nil

node := rootNode.GetNextNode()
rootNode.RemoveNode()

return node, nil
}

// HTMLToNodeTree return html code as a node-tree. If error were to occur it would be SyntaxError.
Expand All @@ -75,12 +30,4 @@ func HTMLToNodeTree(html string) (*Node, error) {
return node, err
}

// isTopNode reports whether node is the element currently on top of stack.
// An empty stack, or a nil top element, yields false.
func isTopNode(node *Node, stack *linkedliststack.Stack) bool {
	top, found := stack.Peek()
	if found && top != nil {
		return top.(*Node) == node
	}
	return false
}
8 changes: 4 additions & 4 deletions querying.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ func matchQueryTokens(node *Node, queryTokens []QueryToken) bool {
return true
}

// QuerySelector returns the first node that matches with the give node.
func (node *Node) QuerySelector(query string) *Node {
// Query returns the first node that matches with the give query.
func (node *Node) Query(query string) *Node {
queryTokens := TokenizeQuery(query)

traverser := NewTraverser(node)
Expand All @@ -188,8 +188,8 @@ func (node *Node) QuerySelector(query string) *Node {
return res
}

// QuerySelectorAll returns a NodeList containing nodes that matched with the given query.
func (node *Node) QuerySelectorAll(query string) NodeList{
// QueryAll returns a NodeList containing nodes that matched with the given query.
func (node *Node) QueryAll(query string) NodeList{
nodeList := NewNodeList()
queryTokens := TokenizeQuery(query)
traverser := NewTraverser(node)
Expand Down
4 changes: 2 additions & 2 deletions querying_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ func TestQuerySelector(t *testing.T) {
t.Fatal(err)
return
}
imgEl := node.QuerySelector("img #idElement")
imgEl := node.Query("img #idElement")
imgSrc, _ := imgEl.GetAttribute("src")
imgAlt, _ := imgEl.GetAttribute("alt")
if imgSrc != "" || imgAlt != "" {
Expand All @@ -187,7 +187,7 @@ func TestQuerySelectorAll(t *testing.T) {
return
}

nodeList := node.QuerySelectorAll("h2")
nodeList := node.QueryAll("h2")
if nodeList.Len() != 2{
t.Fatal("")
}
Expand Down
14 changes: 13 additions & 1 deletion serializer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ package GoHtml_test
import (
"strings"
"testing"
"os"
GoHtml "github.com/udan-jayanith/GoHTML"
)

func TestEncode(t *testing.T) {
func TestEncode1(t *testing.T) {
body := GoHtml.CreateNode("body")
h1 := GoHtml.CreateNode("h1")
h1.AppendText("This is a heading")
Expand All @@ -20,4 +21,15 @@ func TestEncode(t *testing.T) {
builder1 := &strings.Builder{}
GoHtml.Encode(builder1, body)
// It's hard to compare against the exact expected output, because prettier formats the HTML code, and htmlFormatter and prettier add extra things to the HTML, like the dash in void tags. The expected output is in ./test-files/2.html.
}

// TestEncode2 decodes ./test-files/1.html into a node tree and encodes that
// tree into a strings.Builder. It makes no assertion on the produced text.
func TestEncode2(t *testing.T) {
	file, err := os.Open("./test-files/1.html")
	if err != nil {
		// Report the underlying error; the file may exist but be unreadable.
		t.Fatalf("opening ./test-files/1.html: %v", err)
	}
	// Release the file handle when the test finishes.
	defer file.Close()

	node, err := GoHtml.Decode(file)
	if err != nil {
		t.Fatalf("decoding ./test-files/1.html: %v", err)
	}

	var builder strings.Builder
	GoHtml.Encode(&builder, node)
	// It's hard to compare against the exact expected output, because prettier
	// formats the HTML code, and htmlFormatter and prettier add extra things to
	// the HTML, like the dash in void tags. The expected output is in
	// ./test-files/2.html.
}
123 changes: 123 additions & 0 deletions tokenizer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package GoHtml

import (
"io"
"strings"

"github.com/emirpasic/gods/stacks/linkedliststack"
"golang.org/x/net/html"
)


// Tokenizer wraps an *html.Tokenizer from golang.org/x/net/html.
// Advanced moves it to the next token and CurrentNode converts the
// current token into a *Node.
type Tokenizer struct {
// z is the underlying tokenizer created by NewTokenizer.
z *html.Tokenizer
}

// NewTokenizer creates a Tokenizer whose underlying html.Tokenizer reads from r.
func NewTokenizer(r io.Reader) Tokenizer {
	var t Tokenizer
	t.z = html.NewTokenizer(r)
	return t
}

// Advanced moves the underlying tokenizer to the next token and reports
// the type of that token.
func (t *Tokenizer) Advanced() html.TokenType {
	tokenType := t.z.Next()
	return tokenType
}

// CurrentNode converts the tokenizer's current token into a *Node.
//
// It returns nil when the token data is empty after trimming whitespace, and
// for any token type other than text, doctype, start tag or self-closing tag.
//   - Text tokens become text nodes holding the token data.
//   - Doctype tokens become a DOCTYPEDTD node with the token data set as an
//     attribute name with an empty value.
//   - Start and self-closing tags become a node named after the token data,
//     carrying every attribute of the token.
func (t *Tokenizer) CurrentNode() *Node {
	tok := t.z.Token()
	if len(strings.TrimSpace(tok.Data)) == 0 {
		return nil
	}

	switch tok.Type {
	case html.TextToken:
		return CreateTextNode(tok.Data)
	case html.DoctypeToken:
		doctype := CreateNode(DOCTYPEDTD)
		doctype.SetAttribute(tok.Data, "")
		return doctype
	case html.StartTagToken, html.SelfClosingTagToken:
		element := CreateNode(tok.Data)
		for i := range tok.Attr {
			element.SetAttribute(tok.Attr[i].Key, tok.Attr[i].Val)
		}
		return element
	default:
		return nil
	}
}

// NodeTreeBuilder is used to build a node tree given a node and its token type.
// Nodes are fed in through WriteNodeTree and the result is obtained with GetRootNode.
type NodeTreeBuilder struct {
// rootNode is the empty text node created by NewNodeTreeBuilder; GetRootNode
// returns the node after it and then removes it.
rootNode *Node
// stack holds the non-text, non-void nodes pushed by WriteNodeTree; one entry
// is popped for each end tag token.
stack *linkedliststack.Stack
// currentNode is the node most recently written, or the node popped from
// stack by the last end tag token.
currentNode *Node
}

// NewNodeTreeBuilder returns a NodeTreeBuilder with an empty stack. Its root
// and current node both point at the same freshly created empty text node.
func NewNodeTreeBuilder() NodeTreeBuilder {
	placeholder := CreateTextNode("")

	var ntb NodeTreeBuilder
	ntb.rootNode = placeholder
	ntb.currentNode = placeholder
	ntb.stack = linkedliststack.New()
	return ntb
}

// WriteNodeTree adds node to the tree being built, according to the token type tt.
//
// For an end tag token, node is ignored: the top of the stack is popped and
// becomes the current node (nothing happens if the stack is empty).
// For doctype, start tag, self-closing tag and text tokens, a non-nil node is
// attached with AppendChild when the current node is on top of the stack, and
// with Append otherwise. Nodes that are neither text nodes nor void tags are
// pushed onto the stack. The written node then becomes the current node.
// Every other token type is ignored.
func (ntb *NodeTreeBuilder) WriteNodeTree(node *Node, tt html.TokenType) {
	if tt == html.EndTagToken {
		if top, ok := ntb.stack.Pop(); ok && top != nil {
			ntb.currentNode = top.(*Node)
		}
		return
	}

	switch tt {
	case html.DoctypeToken, html.StartTagToken, html.SelfClosingTagToken, html.TextToken:
		// Handled below.
	default:
		return
	}
	if node == nil {
		return
	}

	parent := ntb.currentNode
	if isTopNode(parent, ntb.stack) {
		parent.AppendChild(node)
	} else {
		parent.Append(node)
	}

	// Text nodes and void tags are never pushed onto the stack.
	isLeaf := node.IsTextNode() || IsVoidTag(node.GetTagName())
	if !isLeaf {
		ntb.stack.Push(node)
	}
	ntb.currentNode = node
}

// GetRootNode returns the node that follows the builder's placeholder root
// and removes that placeholder. It then resets the builder: a new empty text
// node becomes both root and current node, and the stack is replaced with an
// empty one.
func (ntb *NodeTreeBuilder) GetRootNode() *Node {
	placeholder := ntb.rootNode
	first := placeholder.GetNextNode()
	placeholder.RemoveNode()

	fresh := CreateTextNode("")
	ntb.rootNode, ntb.currentNode = fresh, fresh
	ntb.stack = linkedliststack.New()

	return first
}

// isTopNode reports whether node is the value at the top of stack.
// It returns false when the stack is empty or its top value is nil.
func isTopNode(node *Node, stack *linkedliststack.Stack) bool {
	if top, ok := stack.Peek(); ok && top != nil {
		return top.(*Node) == node
	}
	return false
}