Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ require (
github.com/bradleyjkemp/cupaloy v2.3.0+incompatible
github.com/hashicorp/golang-lru v0.5.4
github.com/stretchr/testify v1.6.1
golang.org/x/text v0.3.0
)
25 changes: 25 additions & 0 deletions omniparser/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package omniparser

import (
"io"

"github.com/jf-tech/omniparser/omniparser/schemaPlugin"
"github.com/jf-tech/omniparser/omniparser/transformCtx"
)

// Parser is an interface that represents an instance of omniparser.
// One instance of Parser is associated with one and only one schema.
// The instance of Parser can be reused for parsing and transforming
// multiple input files/streams, as long as they are all intended for the
// same schema.
// Each parsing/transform, however, needs a separate instance of
// TransformOp. TransformOp must not be shared and reused across different
// input files/streams.
// While the same instance of Parser can be shared across multiple
// goroutines, TransformOp is not safe for concurrent use. All operations on
// it must be done within the same goroutine.
type Parser interface {
// GetTransformOp returns a TransformOp for parsing/transforming the single
// input stream read from input.
// NOTE(review): name presumably identifies the input (e.g. a file name) and
// ctx presumably carries per-operation context; neither is specified here -
// confirm against the implementation.
GetTransformOp(name string, input io.Reader, ctx *transformCtx.Ctx) (TransformOp, error)
// SchemaHeader returns the schemaPlugin.Header of the schema this Parser is
// associated with.
SchemaHeader() schemaPlugin.Header
// SchemaRawContent presumably returns the unparsed text of the schema this
// Parser is associated with - TODO confirm.
SchemaRawContent() string
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[
"iso-8859-1",
"utf-8",
"windows-1252"
]

50 changes: 50 additions & 0 deletions omniparser/schemaPlugin/header.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package schemaPlugin

import (
"io"

"golang.org/x/text/encoding/charmap"

"github.com/jf-tech/omniparser/strutil"
)

// ParserSettings defines the common header (and its JSON format) for all schemas across all schema plugins.
// It contains vital information about what schema plugin a schema wants to use, and what file format the
// input stream is of (e.g. fixed-length txt, CSV/TSV, XML, JSON, EDI, etc).
// Optionally, it also specifies the expected encoding scheme of the input streams this schema is used
// for.
type ParserSettings struct {
// Version is the schema's "version" setting.
// NOTE(review): presumably this is what selects the schema plugin - confirm.
Version string `json:"version,omitempty"`
// FileFormatType names the file format of the input stream.
FileFormatType string `json:"file_format_type,omitempty"`
// Encoding is the optional encoding of the input stream. When it is nil,
// GetEncoding falls back to EncodingUTF8.
Encoding *string `json:"encoding,omitempty"`
}

// Names of the input stream encodings; these are the keys of
// SupportedEncodingMappings.
const (
// EncodingUTF8 is the UTF-8 (golang's default) encoding scheme.
EncodingUTF8 = "utf-8"
// EncodingISO8859_1 is the ISO 8859-1 encoding.
EncodingISO8859_1 = "iso-8859-1"
// EncodingWindows1252 is the Windows 1252 encoding.
EncodingWindows1252 = "windows-1252"
)

// encodingMappingFunc takes an input stream reader and returns the reader
// callers should read from instead; see SupportedEncodingMappings for the
// per-encoding implementations.
type encodingMappingFunc func(reader io.Reader) io.Reader

// SupportedEncodingMappings maps the name of each supported encoding to a func that wraps an
// input stream reader with the translation specific to that encoding.
var SupportedEncodingMappings = map[string]encodingMappingFunc{
	// UTF-8 input needs no translation, so the reader is handed back untouched.
	EncodingUTF8: func(in io.Reader) io.Reader {
		return in
	},
	// The remaining encodings are read through the matching charmap decoder.
	EncodingISO8859_1: func(in io.Reader) io.Reader {
		decoder := charmap.ISO8859_1.NewDecoder()
		return decoder.Reader(in)
	},
	EncodingWindows1252: func(in io.Reader) io.Reader {
		decoder := charmap.Windows1252.NewDecoder()
		return decoder.Reader(in)
	},
}

// GetEncoding returns the encoding of the schema. If no encoding is specified in the schema, which
// is the most common case, the input stream is assumed to be in UTF-8.
// The value is returned as is; it is not checked against SupportedEncodingMappings.
func (p ParserSettings) GetEncoding() string {
	encoding := strutil.StrPtrOrElse(p.Encoding, EncodingUTF8)
	return encoding
}

// Header contains the common ParserSettings for all schemas.
type Header struct {
// ParserSettings is stored under the "parser_settings" JSON key.
ParserSettings ParserSettings `json:"parser_settings,omitempty"`
}
46 changes: 46 additions & 0 deletions omniparser/schemaPlugin/header_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package schemaPlugin

import (
"io/ioutil"
"sort"
"strings"
"testing"

"github.com/bradleyjkemp/cupaloy"
"github.com/stretchr/testify/assert"

"github.com/jf-tech/omniparser/jsonutil"
"github.com/jf-tech/omniparser/testutil"
)

// TestSupportedEncodingMappingsDump snapshots the sorted list of supported encoding names, so
// that adding or removing an entry in SupportedEncodingMappings shows up as a snapshot diff.
func TestSupportedEncodingMappingsDump(t *testing.T) {
	// The final length is known up front, so size the slice once.
	supported := make([]string, 0, len(SupportedEncodingMappings))
	for k := range SupportedEncodingMappings {
		supported = append(supported, k)
	}
	// Map iteration order is random; sort to keep the snapshot stable.
	sort.Strings(supported)
	cupaloy.SnapshotT(t, jsonutil.BPM(supported))
}

// TestSupportedEncodingMappings checks, in one subtest per encoding, that the plain ASCII input
// "test" is read back unchanged through every supported mapping func.
func TestSupportedEncodingMappings(t *testing.T) {
	for name, wrap := range SupportedEncodingMappings {
		t.Run(name, func(t *testing.T) {
			reader := wrap(strings.NewReader("test"))
			got, err := ioutil.ReadAll(reader)
			assert.NoError(t, err)
			assert.Equal(t, []byte("test"), got)
		})
	}
}

func TestGetEncoding(t *testing.T) {
assert.Equal(
t, EncodingUTF8, (ParserSettings{Encoding: testutil.StrPtr(EncodingUTF8)}).GetEncoding())
assert.Equal(
t, EncodingISO8859_1, (ParserSettings{Encoding: testutil.StrPtr(EncodingISO8859_1)}).GetEncoding())
assert.Equal(
t, EncodingWindows1252, (ParserSettings{Encoding: testutil.StrPtr(EncodingWindows1252)}).GetEncoding())
assert.Equal(
t, EncodingUTF8, (ParserSettings{}).GetEncoding())
assert.Equal(
t, "whatever", (ParserSettings{Encoding: testutil.StrPtr("whatever")}).GetEncoding())
}
5 changes: 5 additions & 0 deletions omniparser/transformCtx/ctx.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package transformCtx

// Ctx contains the context object used throughout the lifespan of a TransformOp action.
// It currently declares no fields.
type Ctx struct {
}
13 changes: 13 additions & 0 deletions omniparser/transformOp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package omniparser

// TransformOp is an interface that represents one input stream parsing/transform operation.
// An instance of TransformOp must not be shared or reused among different input streams.
// An instance of TransformOp should not be used across multiple goroutines.
type TransformOp interface {
// Next indicates whether the parsing/transform operation is completed or not.
// NOTE(review): the meaning of true vs. false is not stated here - presumably
// true means another result is available to Read; confirm against the
// implementation.
Next() bool
// Read returns a JSON byte slice representing one parsing/transform result.
Read() ([]byte, error)
// Parser returns the Parser from which this TransformOp is created.
Parser() Parser
}
8 changes: 8 additions & 0 deletions strutil/strutil.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package strutil

// StrPtrOrElse returns the string sp points to, or orElse if sp is nil.
func StrPtrOrElse(sp *string, orElse string) string {
	if sp == nil {
		return orElse
	}
	return *sp
}
14 changes: 14 additions & 0 deletions strutil/strutil_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package strutil

import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/jf-tech/omniparser/testutil"
)

func TestStrPtrOrElse(t *testing.T) {
assert.Equal(t, "this", StrPtrOrElse(testutil.StrPtr("this"), "that"))
assert.Equal(t, "that", StrPtrOrElse(nil, "that"))
}
13 changes: 13 additions & 0 deletions testutil/testutil.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package testutil

// IntPtr returns a pointer to a fresh int holding the given value.
// Test cases that need an inline int pointer can use this.
func IntPtr(n int) *int {
	ptr := new(int)
	*ptr = n
	return ptr
}

// StrPtr returns a pointer to a fresh string holding the given value.
// Test cases that need an inline string pointer can use this.
func StrPtr(s string) *string {
	ptr := new(string)
	*ptr = s
	return ptr
}
19 changes: 19 additions & 0 deletions testutil/testutil_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package testutil

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestIntPtr checks that IntPtr returns a non-nil pointer to the value it was given.
func TestIntPtr(t *testing.T) {
	const want = 31415926
	got := IntPtr(want)
	assert.NotNil(t, got)
	assert.Equal(t, want, *got)
}

// TestStrPtr checks that StrPtr returns a non-nil pointer to the value it was given.
func TestStrPtr(t *testing.T) {
	const want = "pi"
	got := StrPtr(want)
	assert.NotNil(t, got)
	assert.Equal(t, want, *got)
}