Skip to content

Commit 6bcd509

Browse files
authored
fixed-length fileformat schema validation (#112)
fixed-length fileformat schema validation
1 parent 0af2eca commit 6bcd509

14 files changed

+812
-24
lines changed

extensions/omniv21/fileformat/csv/format_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ func TestValidateSchema(t *testing.T) {
156156
}
157157

158158
func TestCreateFormatReader(t *testing.T) {
159-
r, err := NewCSVFileFormat("test-schema").CreateFormatReader(
159+
r, err := NewCSVFileFormat("test").CreateFormatReader(
160160
"test-input",
161161
strings.NewReader(
162162
lf("A|B|C")+
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
{
2+
"file_declaration": {
3+
"envelopes": [
4+
{
5+
"name": "1",
6+
"by_header_footer": {
7+
"header": "^FILE-BEGIN$",
8+
"footer": "^FILE-BEGIN$"
9+
},
10+
"by_rows": null,
11+
"not_target": true,
12+
"columns": null
13+
},
14+
{
15+
"name": "2",
16+
"by_header_footer": {
17+
"header": "^DATA-BLOCK-BEGIN$",
18+
"footer": "^DATA-BLOCK-END$"
19+
},
20+
"by_rows": null,
21+
"not_target": false,
22+
"columns": [
23+
{
24+
"name": "abc",
25+
"start_pos": 1,
26+
"length": 3,
27+
"line_pattern": "^DATA:.*$"
28+
}
29+
]
30+
},
31+
{
32+
"name": "3",
33+
"by_header_footer": {
34+
"header": "^FILE-END$",
35+
"footer": "^FILE-END$"
36+
},
37+
"by_rows": null,
38+
"not_target": true,
39+
"columns": null
40+
}
41+
]
42+
},
43+
"XPath": ""
44+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"file_declaration": {
3+
"envelopes": [
4+
{
5+
"name": "1",
6+
"by_header_footer": null,
7+
"by_rows": 3,
8+
"not_target": false,
9+
"columns": [
10+
{
11+
"name": "abc",
12+
"start_pos": 1,
13+
"length": 10,
14+
"line_pattern": "^L01.*"
15+
},
16+
{
17+
"name": "efg",
18+
"start_pos": 3,
19+
"length": 5,
20+
"line_pattern": "^L03.*"
21+
}
22+
]
23+
}
24+
]
25+
},
26+
"XPath": ""
27+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"file_declaration": {
3+
"envelopes": [
4+
{
5+
"name": "1",
6+
"by_header_footer": null,
7+
"by_rows": null,
8+
"not_target": false,
9+
"columns": [
10+
{
11+
"name": "abc",
12+
"start_pos": 1,
13+
"length": 10,
14+
"line_pattern": null
15+
}
16+
]
17+
}
18+
]
19+
},
20+
"XPath": ".[abc != 'skip']"
21+
}

extensions/omniv21/fileformat/fixedlength/decl.go

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,18 @@ type byHeaderFooterDecl struct {
1212
}
1313

1414
type columnDecl struct {
15-
Name string `json:"name"`
16-
StartPos int `json:"start_pos"` // 1-based. and rune-based.
17-
Length int `json:"length"` // rune-based length.
18-
Line *string `json:"line"`
19-
}
20-
21-
type envelopeDecl struct {
22-
Name *string `json:"name"`
23-
ByHeaderFooter *byHeaderFooterDecl `json:"by_header_footer"`
24-
ByRows *int `json:"by_rows"`
25-
NotTarget bool `json:"not_target"`
26-
Columns []*columnDecl `json:"columns"`
27-
}
28-
29-
type fileDecl struct {
30-
Envelopes []*envelopeDecl `json:"envelopes"`
15+
Name string `json:"name"`
16+
StartPos int `json:"start_pos"` // 1-based. and rune-based.
17+
Length int `json:"length"` // rune-based length.
18+
LinePattern *string `json:"line_pattern"`
3119
}
3220

3321
func (c *columnDecl) lineMatch(line []byte) bool {
34-
if c.Line == nil {
22+
if c.LinePattern == nil {
3523
return true
3624
}
3725
// validated in validation code.
38-
r, _ := caches.GetRegex(*c.Line)
26+
r, _ := caches.GetRegex(*c.LinePattern)
3927
return r.Match(line)
4028
}
4129

@@ -53,6 +41,14 @@ func (c *columnDecl) lineToColumn(line []rune) []rune {
5341
return nil
5442
}
5543

44+
// envelopeDecl is the JSON-decoded declaration of a single envelope inside a
// fixed-length 'file_declaration'.
//
//   - Name: optional envelope name; schema validation fills in a generated
//     numeric name when it is nil and rejects duplicate names.
//   - ByHeaderFooter: when non-nil, the envelope is delimited by header/footer
//     regex patterns.
//   - ByRows: optional row count for a by-rows envelope; byRows() panics if
//     ByHeaderFooter is set (the package test expects 1 when ByRows is nil).
//   - NotTarget: false marks the envelope as the target; schema validation
//     requires exactly one target envelope.
//   - Columns: the column declarations extracted from the envelope's lines.
type envelopeDecl struct {
45+
Name *string `json:"name"`
46+
ByHeaderFooter *byHeaderFooterDecl `json:"by_header_footer"`
47+
ByRows *int `json:"by_rows"`
48+
NotTarget bool `json:"not_target"`
49+
Columns []*columnDecl `json:"columns"`
50+
}
51+
5652
func (e *envelopeDecl) byRows() int {
5753
if e.ByHeaderFooter != nil {
5854
panic(fmt.Sprintf("envelope '%s' type is not 'by_rows'", *e.Name))
@@ -62,3 +58,21 @@ func (e *envelopeDecl) byRows() int {
6258
}
6359
return *e.ByRows
6460
}
61+
62+
type fileDecl struct {
63+
Envelopes []*envelopeDecl `json:"envelopes"`
64+
}
65+
66+
type envelopeType int
67+
68+
const (
69+
envelopeTypeByRows envelopeType = iota
70+
envelopeTypeByHeaderFooter
71+
)
72+
73+
func (f *fileDecl) envelopeType() envelopeType {
74+
if f.Envelopes[0].ByHeaderFooter != nil {
75+
return envelopeTypeByHeaderFooter
76+
}
77+
return envelopeTypeByRows
78+
}

extensions/omniv21/fileformat/fixedlength/decl_test.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ import (
1010

1111
func TestColumnDecl_LineMatch(t *testing.T) {
1212
assert.True(t, (&columnDecl{}).lineMatch([]byte("test")))
13-
assert.False(t, (&columnDecl{Line: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("test")))
14-
assert.True(t, (&columnDecl{Line: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("ABCDEFG")))
13+
assert.False(t, (&columnDecl{LinePattern: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("test")))
14+
assert.True(t, (&columnDecl{LinePattern: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("ABCDEFG")))
1515
}
1616

1717
func TestColumnDecl_LineToColumn(t *testing.T) {
@@ -30,3 +30,24 @@ func TestEnvelopeDecl_ByRows(t *testing.T) {
3030
assert.Equal(t, 1, (&envelopeDecl{}).byRows())
3131
assert.Equal(t, 12, (&envelopeDecl{ByRows: testlib.IntPtr(12)}).byRows())
3232
}
33+
34+
func TestFileDecl_EnvelopeType(t *testing.T) {
35+
assert.Equal(t, envelopeTypeByHeaderFooter,
36+
(&fileDecl{
37+
Envelopes: []*envelopeDecl{
38+
{ByHeaderFooter: &byHeaderFooterDecl{}},
39+
},
40+
}).envelopeType())
41+
assert.Equal(t, envelopeTypeByRows,
42+
(&fileDecl{
43+
Envelopes: []*envelopeDecl{
44+
{ByRows: testlib.IntPtr(12)},
45+
},
46+
}).envelopeType())
47+
assert.Equal(t, envelopeTypeByRows,
48+
(&fileDecl{
49+
Envelopes: []*envelopeDecl{
50+
{},
51+
},
52+
}).envelopeType())
53+
}
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
package fixedlength
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"io"
7+
"strconv"
8+
"strings"
9+
10+
"github.com/jf-tech/go-corelib/caches"
11+
"github.com/jf-tech/go-corelib/strs"
12+
13+
"github.com/jf-tech/omniparser/errs"
14+
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat"
15+
"github.com/jf-tech/omniparser/extensions/omniv21/transform"
16+
v21validation "github.com/jf-tech/omniparser/extensions/omniv21/validation"
17+
"github.com/jf-tech/omniparser/validation"
18+
)
19+
20+
const (
21+
fileFormatFixedLength = "fixed-length"
22+
)
23+
24+
type fixedLengthFileFormat struct {
25+
schemaName string
26+
autoGenEnvelopeNameIndex int
27+
}
28+
29+
// NewFixedLengthFileFormat creates a FileFormat for fixed-length files.
30+
func NewFixedLengthFileFormat(schemaName string) fileformat.FileFormat {
31+
return &fixedLengthFileFormat{schemaName: schemaName}
32+
}
33+
34+
type fixedLengthFormatRuntime struct {
35+
Decl *fileDecl `json:"file_declaration"`
36+
XPath string
37+
}
38+
39+
func (f *fixedLengthFileFormat) ValidateSchema(
40+
format string, schemaContent []byte, finalOutputDecl *transform.Decl) (interface{}, error) {
41+
if format != fileFormatFixedLength {
42+
return nil, errs.ErrSchemaNotSupported
43+
}
44+
err := validation.SchemaValidate(f.schemaName, schemaContent, v21validation.JSONSchemaFixedLengthFileDeclaration)
45+
if err != nil {
46+
// err is already context formatted.
47+
return nil, err
48+
}
49+
var runtime fixedLengthFormatRuntime
50+
_ = json.Unmarshal(schemaContent, &runtime) // JSON schema validation earlier guarantees Unmarshal success.
51+
err = f.validateFileDecl(runtime.Decl)
52+
if err != nil {
53+
// err is already context formatted.
54+
return nil, err
55+
}
56+
if finalOutputDecl == nil {
57+
return nil, f.FmtErr("'FINAL_OUTPUT' is missing")
58+
}
59+
runtime.XPath = strings.TrimSpace(strs.StrPtrOrElse(finalOutputDecl.XPath, ""))
60+
if runtime.XPath != "" {
61+
_, err := caches.GetXPathExpr(runtime.XPath)
62+
if err != nil {
63+
return nil, f.FmtErr("'FINAL_OUTPUT.xpath' (value: '%s') is invalid, err: %s",
64+
runtime.XPath, err.Error())
65+
}
66+
}
67+
return &runtime, nil
68+
}
69+
70+
func (f *fixedLengthFileFormat) validateFileDecl(decl *fileDecl) error {
71+
targetSeen := false
72+
namesSeen := map[string]bool{}
73+
for _, envelope := range decl.Envelopes {
74+
if targetSeen && !envelope.NotTarget {
75+
return f.FmtErr("cannot have more than one target envelope")
76+
}
77+
targetSeen = targetSeen || !envelope.NotTarget
78+
if envelope.Name == nil {
79+
f.autoGenEnvelopeNameIndex++
80+
envelope.Name = strs.StrPtr(strconv.Itoa(f.autoGenEnvelopeNameIndex))
81+
}
82+
if _, found := namesSeen[*envelope.Name]; found {
83+
return f.FmtErr("more than one envelope has the name '%s'", *envelope.Name)
84+
}
85+
namesSeen[*envelope.Name] = true
86+
if err := f.validateByHeaderFooter(envelope.ByHeaderFooter); err != nil {
87+
return err
88+
}
89+
if err := f.validateColumns(envelope.Columns); err != nil {
90+
return err
91+
}
92+
}
93+
if !targetSeen {
94+
return f.FmtErr("missing target envelope")
95+
}
96+
return nil
97+
}
98+
99+
func (f *fixedLengthFileFormat) validateByHeaderFooter(decl *byHeaderFooterDecl) error {
100+
if decl == nil {
101+
return nil
102+
}
103+
_, err := caches.GetRegex(decl.Header)
104+
if err != nil {
105+
return f.FmtErr("invalid 'header' regex '%s': %s", decl.Header, err.Error())
106+
}
107+
_, err = caches.GetRegex(decl.Footer)
108+
if err != nil {
109+
return f.FmtErr("invalid 'footer' regex '%s': %s", decl.Footer, err.Error())
110+
}
111+
return nil
112+
}
113+
114+
func (f *fixedLengthFileFormat) validateColumns(cols []*columnDecl) error {
115+
columnNamesSeen := map[string]bool{}
116+
for _, col := range cols {
117+
if _, found := columnNamesSeen[col.Name]; found {
118+
return f.FmtErr("more than one column has the name '%s'", col.Name)
119+
}
120+
columnNamesSeen[col.Name] = true
121+
if col.LinePattern != nil {
122+
if _, err := caches.GetRegex(*col.LinePattern); err != nil {
123+
return f.FmtErr("invalid 'line_pattern' regex '%s': %s", *col.LinePattern, err.Error())
124+
}
125+
}
126+
}
127+
return nil
128+
}
129+
130+
func (f *fixedLengthFileFormat) CreateFormatReader(
131+
name string, r io.Reader, runtime interface{}) (fileformat.FormatReader, error) {
132+
// TODO
133+
_ = runtime.(*fixedLengthFormatRuntime)
134+
return nil, nil
135+
}
136+
137+
func (f *fixedLengthFileFormat) FmtErr(format string, args ...interface{}) error {
138+
return fmt.Errorf("schema '%s': %s", f.schemaName, fmt.Sprintf(format, args...))
139+
}

0 commit comments

Comments
 (0)