Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion goawk.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,10 @@ Additional GoAWK features:
-E progfile load program, treat as last option, disable var=value args
-H parse header row and enable @"field" in CSV input mode
-h, --help show this help message
-i mode parse input into fields using CSV format (ignore FS and RS)
-i mode parse input into fields using CSV or JSON Lines format
(ignore FS and RS)
'csv|tsv [separator=<char>] [comment=<char>] [header]'
'jsonl' (JSON Lines: arrays map to $1/$2/..., objects to @"name")
-o mode use CSV output for print with args (ignore OFS and ORS)
'csv|tsv [separator=<char>]'
-N mode newline output translation: smart (default), raw, crlf
Expand Down
37 changes: 34 additions & 3 deletions interp/interp.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,13 @@ type Config struct {
// "encoding/csv" package, but FieldsPerRecord is not supported,
// LazyQuotes is always on, and TrimLeadingSpace is always off.
//
// You can also enable CSV or TSV input mode by setting INPUTMODE to "csv"
// or "tsv" in Vars or in the BEGIN block (those override this setting).
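// If set to JSONLMode, each input line is parsed as a JSON value. JSON
// arrays map elements to $1, $2, etc. JSON objects map keys to named
// fields accessible via @"name" as well as $1, $2, etc. (in document key
// order). Nested objects and arrays inside an object are flattened into
// dotted key paths, for example @"obj.x" or @"arr.0".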
//
// You can also enable CSV, TSV, or JSONL input mode by setting INPUTMODE
// to "csv", "tsv", or "jsonl" in Vars or in the BEGIN block (those
// override this setting).
//
// For further documentation about GoAWK's CSV support, see the full docs
// in "../docs/csv.md".
Expand Down Expand Up @@ -336,6 +341,12 @@ const (

// TSVMode uses tab-separated value mode for input or output.
TSVMode IOMode = 2

// JSONLMode uses JSON Lines format for input. Each line must be a JSON
// value (typically an array or object). JSON arrays map elements to $1,
// $2, etc. JSON objects map keys to named fields accessible via @"name"
// as well as $1, $2, etc. (in document key order).
JSONLMode IOMode = 3
)

// CSVInputConfig holds additional configuration for when InputMode is CSVMode
Expand Down Expand Up @@ -457,6 +468,10 @@ func (p *interp) setExecuteConfig(config *Config) error {
if p.csvInputConfig.Separator == 0 {
p.csvInputConfig.Separator = '\t'
}
case JSONLMode:
if p.csvInputConfig != (CSVInputConfig{}) {
return newError("CSV input configuration options are not supported in JSONL mode")
}
case DefaultMode:
if p.csvInputConfig != (CSVInputConfig{}) {
return newError("input mode configuration not valid in default input mode")
Expand Down Expand Up @@ -957,11 +972,19 @@ func (p *interp) getField(index int) value {
}
}

// Get the value of a field by name (for CSV/TSV mode), as in @"name".
// Get the value of a field by name (for CSV/TSV/JSONL mode), as in @"name".
func (p *interp) getFieldByName(name string) (value, error) {
if p.inputMode == JSONLMode {
// In JSONL mode, we must ensure fields are parsed (per-record field
// names come from each JSON object, not a fixed header).
p.ensureFields()
}
if p.fieldIndexes == nil {
// Lazily create map of field names to indexes.
if p.fieldNames == nil {
if p.inputMode == JSONLMode {
return null(), newError(`no field names for @"name" in JSONL mode; current record is not a JSON object`)
}
return null(), newError(`no field names for @; use -H or add "header" to INPUTMODE, and use "getline" first if in BEGIN`)
}
p.fieldIndexes = make(map[string]int, len(p.fieldNames))
Expand Down Expand Up @@ -1058,6 +1081,8 @@ func inputModeString(mode IOMode, csvConfig CSVInputConfig) string {
case TSVMode:
s = "tsv"
defaultSep = '\t'
case JSONLMode:
return "jsonl"
case DefaultMode:
return ""
}
Expand Down Expand Up @@ -1085,6 +1110,12 @@ func parseInputMode(s string) (mode IOMode, csvConfig CSVInputConfig, err error)
case "tsv":
mode = TSVMode
csvConfig.Separator = '\t'
case "jsonl":
mode = JSONLMode
if len(fields) > 1 {
return DefaultMode, CSVInputConfig{}, newError("jsonl input mode takes no options")
}
return mode, CSVInputConfig{}, nil
default:
return DefaultMode, CSVInputConfig{}, newError("invalid input mode %q", fields[0])
}
Expand Down
157 changes: 157 additions & 0 deletions interp/interp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2135,6 +2135,163 @@ BEGIN {
{`BEGIN { x="a"; @x += "y" }`, "", "", "parse error at 1:19: assigning @ expression not supported", nil},
}

// jsonlTests holds the JSON Lines (JSONL) input mode test cases that
// TestJSONL runs as subtests. Each entry is a positional csvTest literal
// giving, in order: the AWK program (src), the input text (in), the expected
// output (out), the expected error text (err, "" when no error is expected),
// and an optional function that adjusts the interp.Config before the run
// (configure, nil when unused).
var jsonlTests = []csvTest{
// JSON arrays: elements map to $1, $2, etc.
{`BEGIN { INPUTMODE="jsonl" } { print $1, $2, $3 }`,
`["Bob", "Smith", 42]` + "\n" + `["Jane", "Brown", 37]`,
"Bob Smith 42\nJane Brown 37\n", "", nil},

// JSON boolean and null conversions: true prints as 1 and false as 0.
// NOTE(review): if null becomes an empty $3, print would emit two spaces
// between "0" and "3.14"; confirm the expected output below has not had
// its whitespace collapsed.
{`BEGIN { INPUTMODE="jsonl" } { print $1, $2, $3, $4 }`,
`[true, false, null, 3.14]`,
"1 0 3.14\n", "", nil},

// JSON objects: @"name" syntax
{`BEGIN { INPUTMODE="jsonl" } { print @"name", @"age" }`,
`{"name":"Bob","age":42}` + "\n" + `{"name":"Jane","age":37}`,
"Bob 42\nJane 37\n", "", nil},

// JSON objects: dynamic @x lookup
{`BEGIN { INPUTMODE="jsonl" } { x="name"; print @x, @"age" }`,
`{"name":"Alice","age":25}`,
"Alice 25\n", "", nil},

// JSON objects: missing key returns empty string
{`BEGIN { INPUTMODE="jsonl" } { print @"name", @"missing" }`,
`{"name":"Bob","age":42}`,
"Bob \n", "", nil},

// JSON objects: $1, $2, ... also work (in document order)
{`BEGIN { INPUTMODE="jsonl" } { print $1, $2 }`,
`{"name":"Bob","age":42}`,
"Bob 42\n", "", nil},

// JSON objects: FIELDS array is updated per record
{`BEGIN { INPUTMODE="jsonl" } { print FIELDS[1], FIELDS[2] }`,
`{"name":"Bob","age":42}` + "\n" + `{"city":"NY","zip":"10001"}`,
"name age\ncity zip\n", "", nil},

// JSON objects: NF works
{`BEGIN { INPUTMODE="jsonl" } { print NF }`,
`{"a":1,"b":2,"c":3}`,
"3\n", "", nil},

// JSON objects: $0 is the raw JSON line
{`BEGIN { INPUTMODE="jsonl" } { print $0 }`,
`{"name":"Bob","age":42}`,
`{"name":"Bob","age":42}` + "\n", "", nil},

// Nested objects/arrays are flattened with dot notation
{`BEGIN { INPUTMODE="jsonl" } { print @"arr.0", @"arr.1", @"arr.2" }`,
`{"arr":[1,2,3]}`,
"1 2 3\n", "", nil},
{`BEGIN { INPUTMODE="jsonl" } { print @"obj.x" }`,
`{"obj":{"x":1}}`,
"1\n", "", nil},

// Flattened keys are not accessible via the unflattened parent name
{`BEGIN { INPUTMODE="jsonl" } { print @"arr", @"obj" }`,
`{"arr":[1,2,3],"obj":{"x":1}}`,
" \n", "", nil},

// Deeply nested: @"a.b.c" and @"a.b.d.0"
{`BEGIN { INPUTMODE="jsonl" } { print @"a.b.c", @"a.b.d.0", @"a.b.d.1" }`,
`{"a":{"b":{"c":"hello","d":[10,20]}}}`,
"hello 10 20\n", "", nil},

// Flattened NF counts all scalar leaves
{`BEGIN { INPUTMODE="jsonl" } { print NF }`,
`{"a":1,"b":{"c":2,"d":3},"e":[4,5]}`,
"5\n", "", nil},

// FIELDS array contains flattened key paths
{`BEGIN { INPUTMODE="jsonl" } { for (i=1; i<=NF; i++) printf "%s=%s\n", FIELDS[i], $i }`,
`{"x":1,"y":{"z":2}}`,
"x=1\ny.z=2\n", "", nil},

// JSON objects: different keys per line (each line independent)
{`BEGIN { INPUTMODE="jsonl" } { print @"a", @"b", @"c" }`,
`{"a":"x","b":"y"}` + "\n" + `{"b":"p","c":"q"}`,
"x y \n p q\n", "", nil},

// INPUTMODE "jsonl" round-trips via INPUTMODE variable
{`BEGIN { INPUTMODE="jsonl"; print INPUTMODE }`, "", "jsonl\n", "", nil},

// NR and FNR work correctly in JSONL mode
{`BEGIN { INPUTMODE="jsonl" } { print NR, $1 }`,
`["a"]` + "\n" + `["b"]` + "\n" + `["c"]`,
"1 a\n2 b\n3 c\n", "", nil},

// Filtering works in JSONL mode
{`BEGIN { INPUTMODE="jsonl" } @"type"=="error" { print @"msg" }`,
`{"type":"info","msg":"ok"}` + "\n" + `{"type":"error","msg":"fail"}`,
"fail\n", "", nil},

// @"name" on a JSON array line returns an error
{`BEGIN { INPUTMODE="jsonl" } { print @"x" }`,
`["a","b"]`,
"", `no field names for @"name" in JSONL mode; current record is not a JSON object`, nil},

// Configure via interp.Config struct
{`{ print $1, $2 }`, `["hello","world"]`, "hello world\n", "", func(config *interp.Config) {
config.InputMode = interp.JSONLMode
}},
{`{ print @"k", @"v" }`, `{"k":"key","v":"val"}`, "key val\n", "", func(config *interp.Config) {
config.InputMode = interp.JSONLMode
}},

// $0 reassignment in JSONL mode re-parses as JSON
{`BEGIN { INPUTMODE="jsonl" } { $0 = "{\"x\":99}"; print @"x" }`,
`{"x":1}`,
"99\n", "", nil},

// NF works correctly for JSON arrays
{`BEGIN { INPUTMODE="jsonl" } { print NF }`,
`["a","b","c","d"]`,
"4\n", "", nil},

// Empty lines are skipped in JSONL mode
{`BEGIN { INPUTMODE="jsonl" } { print NR, $1 }`,
"\n" + `["a"]` + "\n\n" + `["b"]` + "\n",
"1 a\n2 b\n", "", nil},

// Unicode strings work in JSONL mode
{`BEGIN { INPUTMODE="jsonl" } { print @"name" }`,
`{"name":"日本語"}`,
"日本語\n", "", nil},

// JSON strings with escape sequences are unescaped
{`BEGIN { INPUTMODE="jsonl" } { print @"s" }`,
`{"s":"hello\nworld"}`,
"hello\nworld\n", "", nil},

// Arithmetic on JSON numbers
{`BEGIN { INPUTMODE="jsonl" } { print @"a" + @"b" }`,
`{"a":10,"b":32}`,
"42\n", "", nil},

// Boolean comparison on true/false values
{`BEGIN { INPUTMODE="jsonl" } { print ($1 == 1), ($2 == 0) }`,
`[true, false]`,
"1 1\n", "", nil},

// INPUTMODE "jsonl" option parsing error
{`BEGIN { INPUTMODE="jsonl foo" }`, "", "", `jsonl input mode takes no options`, nil},
}

// TestJSONL runs every entry of jsonlTests as its own subtest. The subtest
// name is the AWK program source, cut to at most 70 bytes so long programs
// still produce manageable names. Each case is handed to testGoAWK with its
// program, input, expected output, expected error, and optional config hook.
func TestJSONL(t *testing.T) {
	const maxNameLen = 70
	for _, tc := range jsonlTests {
		name := tc.src
		if len(name) > maxNameLen {
			name = name[:maxNameLen]
		}
		// The subtest body runs synchronously inside t.Run, so capturing
		// the loop variable here is safe.
		run := func(t *testing.T) {
			testGoAWK(t, tc.src, tc.in, tc.out, tc.err, nil, tc.configure)
		}
		t.Run(name, run)
	}
}

func TestCSV(t *testing.T) {
for _, test := range csvTests {
testName := test.src
Expand Down
20 changes: 19 additions & 1 deletion interp/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,13 @@ func (p *interp) newScanner(input io.Reader, buffer []byte) *bufio.Scanner {
setFieldNames: p.setFieldNames,
}
scanner.Split(splitter.scan)
case p.inputMode == JSONLMode:
splitter := &jsonlSplitter{
fields: &p.fields,
setFieldNames: p.setFieldNames,
interp: p,
}
scanner.Split(splitter.scan)
case p.recordSep == "\n":
// Scanner default is to split on newlines
case p.recordSep == "":
Expand All @@ -274,7 +281,8 @@ func (p *interp) newScanner(input io.Reader, buffer []byte) *bufio.Scanner {
}

// setFieldNames is called by csvSplitter.scan on the first row (if the
// "header" option is specified).
// "header" option is specified), and by parseJSONLine for each JSON object
// record. If names is nil, field names are cleared.
func (p *interp) setFieldNames(names []string) {
p.fieldNames = names
p.fieldIndexes = nil // clear name-to-index cache
Expand Down Expand Up @@ -685,6 +693,16 @@ func (p *interp) ensureFields() {
p.fields = nil
}
}
case p.inputMode == JSONLMode:
// Normally fields have already been parsed by jsonlSplitter.
// Only re-parse if $0 was explicitly assigned (reparseCSV flag).
if p.reparseCSV {
if err := p.parseJSONLine(p.line); err != nil {
fmt.Fprintf(p.errorOutput, "goawk: %s\n", err)
p.fields = nil
p.setFieldNames(nil)
}
}
case p.savedFieldSep == " ":
// FS space (default) means split fields on any whitespace
p.fields = strings.Fields(p.line)
Expand Down
Loading