From fa3562dc970d59ea6df32644743346623369f0ff Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 20:04:42 +0000 Subject: [PATCH 1/4] Initial plan From af509fd74b0ae1ccd39fb25c92af7709c53af254 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 20:19:38 +0000 Subject: [PATCH 2/4] Add JSON Lines (JSONL) input mode support Agent-Logs-Url: https://github.com/benhoyt/goawk/sessions/29bc6e5b-0551-41a0-83cf-9d679344b4d0 Co-authored-by: benhoyt <999033+benhoyt@users.noreply.github.com> --- goawk.go | 4 +- interp/interp.go | 37 +++++++- interp/interp_test.go | 134 +++++++++++++++++++++++++++ interp/io.go | 20 +++- interp/jsonl.go | 206 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 396 insertions(+), 5 deletions(-) create mode 100644 interp/jsonl.go diff --git a/goawk.go b/goawk.go index 35356926..7deae358 100644 --- a/goawk.go +++ b/goawk.go @@ -61,8 +61,10 @@ Additional GoAWK features: -E progfile load program, treat as last option, disable var=value args -H parse header row and enable @"field" in CSV input mode -h, --help show this help message - -i mode parse input into fields using CSV format (ignore FS and RS) + -i mode parse input into fields using CSV or JSON Lines format + (ignore FS and RS) 'csv|tsv [separator=] [comment=] [header]' + 'jsonl' (JSON Lines: arrays map to $1/$2/..., objects to @"name") -o mode use CSV output for print with args (ignore OFS and ORS) 'csv|tsv [separator=]' -N mode newline output translation: smart (default), raw, crlf diff --git a/interp/interp.go b/interp/interp.go index 121ff9be..50ed53e7 100644 --- a/interp/interp.go +++ b/interp/interp.go @@ -276,8 +276,13 @@ type Config struct { // "encoding/csv" package, but FieldsPerRecord is not supported, // LazyQuotes is always on, and TrimLeadingSpace is always off. 
// - // You can also enable CSV or TSV input mode by setting INPUTMODE to "csv" - // or "tsv" in Vars or in the BEGIN block (those override this setting). + // If set to JSONLMode, each input line is parsed as a JSON value. JSON + // arrays map elements to $1, $2, etc. JSON objects map keys to named + // fields accessible via @"name" as well as $1, $2, etc. + // + // You can also enable CSV, TSV, or JSONL input mode by setting INPUTMODE + // to "csv", "tsv", or "jsonl" in Vars or in the BEGIN block (those + // override this setting). // // For further documentation about GoAWK's CSV support, see the full docs // in "../docs/csv.md". @@ -336,6 +341,12 @@ const ( // TSVMode uses tab-separated value mode for input or output. TSVMode IOMode = 2 + + // JSONLMode uses JSON Lines format for input. Each line must be a JSON + // value (typically an array or object). JSON arrays map elements to $1, + // $2, etc. JSON objects map keys to named fields accessible via @"name" + // as well as $1, $2, etc. (in document key order). + JSONLMode IOMode = 3 ) // CSVInputConfig holds additional configuration for when InputMode is CSVMode @@ -457,6 +468,10 @@ func (p *interp) setExecuteConfig(config *Config) error { if p.csvInputConfig.Separator == 0 { p.csvInputConfig.Separator = '\t' } + case JSONLMode: + if p.csvInputConfig != (CSVInputConfig{}) { + return newError("input mode configuration not valid in jsonl input mode") + } case DefaultMode: if p.csvInputConfig != (CSVInputConfig{}) { return newError("input mode configuration not valid in default input mode") @@ -957,11 +972,19 @@ func (p *interp) getField(index int) value { } } -// Get the value of a field by name (for CSV/TSV mode), as in @"name". +// Get the value of a field by name (for CSV/TSV/JSONL mode), as in @"name". 
func (p *interp) getFieldByName(name string) (value, error) { + if p.inputMode == JSONLMode { + // In JSONL mode, we must ensure fields are parsed (per-record field + // names come from each JSON object, not a fixed header). + p.ensureFields() + } if p.fieldIndexes == nil { // Lazily create map of field names to indexes. if p.fieldNames == nil { + if p.inputMode == JSONLMode { + return null(), newError(`no field names for @"name" in JSONL mode; current record is not a JSON object`) + } return null(), newError(`no field names for @; use -H or add "header" to INPUTMODE, and use "getline" first if in BEGIN`) } p.fieldIndexes = make(map[string]int, len(p.fieldNames)) @@ -1058,6 +1081,8 @@ func inputModeString(mode IOMode, csvConfig CSVInputConfig) string { case TSVMode: s = "tsv" defaultSep = '\t' + case JSONLMode: + return "jsonl" case DefaultMode: return "" } @@ -1085,6 +1110,12 @@ func parseInputMode(s string) (mode IOMode, csvConfig CSVInputConfig, err error) case "tsv": mode = TSVMode csvConfig.Separator = '\t' + case "jsonl": + mode = JSONLMode + if len(fields) > 1 { + return DefaultMode, CSVInputConfig{}, newError("jsonl input mode takes no options") + } + return mode, CSVInputConfig{}, nil default: return DefaultMode, CSVInputConfig{}, newError("invalid input mode %q", fields[0]) } diff --git a/interp/interp_test.go b/interp/interp_test.go index a511525f..7cc00f49 100644 --- a/interp/interp_test.go +++ b/interp/interp_test.go @@ -2135,6 +2135,140 @@ BEGIN { {`BEGIN { x="a"; @x += "y" }`, "", "", "parse error at 1:19: assigning @ expression not supported", nil}, } +// JSON Lines (JSONL) test cases +var jsonlTests = []csvTest{ + // JSON arrays: elements map to $1, $2, etc. 
+ {`BEGIN { INPUTMODE="jsonl" } { print $1, $2, $3 }`, + `["Bob", "Smith", 42]` + "\n" + `["Jane", "Brown", 37]`, + "Bob Smith 42\nJane Brown 37\n", "", nil}, + + // JSON boolean and null conversions + {`BEGIN { INPUTMODE="jsonl" } { print $1, $2, $3, $4 }`, + `[true, false, null, 3.14]`, + "1 0 3.14\n", "", nil}, + + // JSON objects: @"name" syntax + {`BEGIN { INPUTMODE="jsonl" } { print @"name", @"age" }`, + `{"name":"Bob","age":42}` + "\n" + `{"name":"Jane","age":37}`, + "Bob 42\nJane 37\n", "", nil}, + + // JSON objects: dynamic @x lookup + {`BEGIN { INPUTMODE="jsonl" } { x="name"; print @x, @"age" }`, + `{"name":"Alice","age":25}`, + "Alice 25\n", "", nil}, + + // JSON objects: missing key returns empty string + {`BEGIN { INPUTMODE="jsonl" } { print @"name", @"missing" }`, + `{"name":"Bob","age":42}`, + "Bob \n", "", nil}, + + // JSON objects: $1, $2, ... also work (in document order) + {`BEGIN { INPUTMODE="jsonl" } { print $1, $2 }`, + `{"name":"Bob","age":42}`, + "Bob 42\n", "", nil}, + + // JSON objects: FIELDS array is updated per record + {`BEGIN { INPUTMODE="jsonl" } { print FIELDS[1], FIELDS[2] }`, + `{"name":"Bob","age":42}` + "\n" + `{"city":"NY","zip":"10001"}`, + "name age\ncity zip\n", "", nil}, + + // JSON objects: NF works + {`BEGIN { INPUTMODE="jsonl" } { print NF }`, + `{"a":1,"b":2,"c":3}`, + "3\n", "", nil}, + + // JSON objects: $0 is the raw JSON line + {`BEGIN { INPUTMODE="jsonl" } { print $0 }`, + `{"name":"Bob","age":42}`, + `{"name":"Bob","age":42}` + "\n", "", nil}, + + // Nested objects/arrays are returned as JSON strings + {`BEGIN { INPUTMODE="jsonl" } { print @"arr", @"obj" }`, + `{"arr":[1,2,3],"obj":{"x":1}}`, + "[1,2,3] {\"x\":1}\n", "", nil}, + + // JSON objects: different keys per line (each line independent) + {`BEGIN { INPUTMODE="jsonl" } { print @"a", @"b", @"c" }`, + `{"a":"x","b":"y"}` + "\n" + `{"b":"p","c":"q"}`, + "x y \n p q\n", "", nil}, + + // INPUTMODE "jsonl" round-trips via INPUTMODE variable + {`BEGIN { 
INPUTMODE="jsonl"; print INPUTMODE }`, "", "jsonl\n", "", nil}, + + // NR and FNR work correctly in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } { print NR, $1 }`, + `["a"]` + "\n" + `["b"]` + "\n" + `["c"]`, + "1 a\n2 b\n3 c\n", "", nil}, + + // Filtering works in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } @"type"=="error" { print @"msg" }`, + `{"type":"info","msg":"ok"}` + "\n" + `{"type":"error","msg":"fail"}`, + "fail\n", "", nil}, + + // @"name" on a JSON array line returns an error + {`BEGIN { INPUTMODE="jsonl" } { print @"x" }`, + `["a","b"]`, + "", `no field names for @"name" in JSONL mode; current record is not a JSON object`, nil}, + + // Configure via interp.Config struct + {`{ print $1, $2 }`, `["hello","world"]`, "hello world\n", "", func(config *interp.Config) { + config.InputMode = interp.JSONLMode + }}, + {`{ print @"k", @"v" }`, `{"k":"key","v":"val"}`, "key val\n", "", func(config *interp.Config) { + config.InputMode = interp.JSONLMode + }}, + + // $0 reassignment in JSONL mode re-parses as JSON + {`BEGIN { INPUTMODE="jsonl" } { $0 = "{\"x\":99}"; print @"x" }`, + `{"x":1}`, + "99\n", "", nil}, + + // NF works correctly for JSON arrays + {`BEGIN { INPUTMODE="jsonl" } { print NF }`, + `["a","b","c","d"]`, + "4\n", "", nil}, + + // Empty lines are skipped in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } { print NR, $1 }`, + "\n" + `["a"]` + "\n\n" + `["b"]` + "\n", + "1 a\n2 b\n", "", nil}, + + // Unicode strings work in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } { print @"name" }`, + `{"name":"日本語"}`, + "日本語\n", "", nil}, + + // JSON strings with escape sequences are unescaped + {`BEGIN { INPUTMODE="jsonl" } { print @"s" }`, + `{"s":"hello\nworld"}`, + "hello\nworld\n", "", nil}, + + // Arithmetic on JSON numbers + {`BEGIN { INPUTMODE="jsonl" } { print @"a" + @"b" }`, + `{"a":10,"b":32}`, + "42\n", "", nil}, + + // Boolean comparison on true/false values + {`BEGIN { INPUTMODE="jsonl" } { print ($1 == 1), ($2 == 0) }`, + `[true, false]`, + "1 1\n", "", 
nil}, + + // INPUTMODE "jsonl" option parsing error + {`BEGIN { INPUTMODE="jsonl foo" }`, "", "", `jsonl input mode takes no options`, nil}, +} + +func TestJSONL(t *testing.T) { + for _, test := range jsonlTests { + testName := test.src + if len(testName) > 70 { + testName = testName[:70] + } + t.Run(testName, func(t *testing.T) { + testGoAWK(t, test.src, test.in, test.out, test.err, nil, test.configure) + }) + } +} + func TestCSV(t *testing.T) { for _, test := range csvTests { testName := test.src diff --git a/interp/io.go b/interp/io.go index 72e9af57..64dc9b2b 100644 --- a/interp/io.go +++ b/interp/io.go @@ -255,6 +255,13 @@ func (p *interp) newScanner(input io.Reader, buffer []byte) *bufio.Scanner { setFieldNames: p.setFieldNames, } scanner.Split(splitter.scan) + case p.inputMode == JSONLMode: + splitter := &jsonlSplitter{ + fields: &p.fields, + setFieldNames: p.setFieldNames, + interp: p, + } + scanner.Split(splitter.scan) case p.recordSep == "\n": // Scanner default is to split on newlines case p.recordSep == "": @@ -274,7 +281,8 @@ func (p *interp) newScanner(input io.Reader, buffer []byte) *bufio.Scanner { } // setFieldNames is called by csvSplitter.scan on the first row (if the -// "header" option is specified). +// "header" option is specified), and by jsonlSplitter.scan and parseJSONLine +// for every JSONL record (names is nil, clearing field names, for non-objects). func (p *interp) setFieldNames(names []string) { p.fieldNames = names p.fieldIndexes = nil // clear name-to-index cache @@ -685,6 +693,16 @@ func (p *interp) ensureFields() { p.fields = nil } } + case p.inputMode == JSONLMode: + // Normally fields have already been parsed by jsonlSplitter. + // Only re-parse if $0 was explicitly assigned (reparseCSV flag). 
+ if p.reparseCSV { + if err := p.parseJSONLine(p.line); err != nil { + fmt.Fprintf(p.errorOutput, "goawk: %s\n", err) + p.fields = nil + p.setFieldNames(nil) + } + } case p.savedFieldSep == " ": // FS space (default) means split fields on any whitespace p.fields = strings.Fields(p.line) diff --git a/interp/jsonl.go b/interp/jsonl.go new file mode 100644 index 00000000..8c93699f --- /dev/null +++ b/interp/jsonl.go @@ -0,0 +1,206 @@ +// JSON Lines input parsing for GoAWK interpreter. + +package interp + +import ( + "bytes" + "encoding/json" + "fmt" +) + +// jsonlSplitter is a bufio.Scanner split function for JSON Lines input. +// It splits on newlines, skipping empty lines, and pre-parses each JSON line +// into fields (like csvSplitter does for CSV). This ensures that FIELDS and +// other per-record state are populated before each action body runs. +type jsonlSplitter struct { + fields *[]string + setFieldNames func(names []string) + interp *interp // for error reporting +} + +func (s *jsonlSplitter) scan(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + // Find and skip empty lines; stop at first non-empty line. + skip := 0 + var line []byte + for { + newline := bytes.IndexByte(data, '\n') + var lineEnd int + if newline >= 0 { + lineEnd = newline + 1 + } else if atEOF { + lineEnd = len(data) + } else { + return 0, nil, nil // need more data + } + + candidate := dropCR(data[:lineEnd-lenNewline(data[:lineEnd])]) + if len(bytes.TrimSpace(candidate)) > 0 { + line = candidate + advance += lineEnd + break + } + // Empty line: skip it + advance += lineEnd + skip += lineEnd + data = data[lineEnd:] + if atEOF && len(data) == 0 { + return advance, nil, nil + } + } + + // Parse the JSON line and populate fields / field names. 
+ fields, names, parseErr := parseJSONLineToFields(line) + if parseErr != nil { + if s.interp != nil { + fmt.Fprintf(s.interp.errorOutput, "goawk: %s\n", parseErr) + } + fields = nil + names = nil + } + *s.fields = fields + s.setFieldNames(names) + + return advance, line, nil +} + +// parseJSONLine parses a JSON line and populates p.fields and p.fieldNames. +// Called by ensureFields() when $0 is reassigned in JSONL mode. +func (p *interp) parseJSONLine(line string) error { + fields, names, err := parseJSONLineToFields([]byte(line)) + if err != nil { + return err + } + p.fields = fields + p.setFieldNames(names) + return nil +} + +// parseJSONLineToFields parses a JSON line and returns the field values and +// (for objects) the field names. For arrays, names is nil. +func parseJSONLineToFields(line []byte) (fields []string, names []string, err error) { + if len(bytes.TrimSpace(line)) == 0 { + return nil, nil, nil + } + + dec := json.NewDecoder(bytes.NewReader(line)) + dec.UseNumber() + + token, err := dec.Token() + if err != nil { + return nil, nil, fmt.Errorf("invalid JSON: %w", err) + } + + switch t := token.(type) { + case json.Delim: + switch t { + case '[': + fields, err = parseJSONArrayFields(dec) + return fields, nil, err + case '{': + return parseJSONObjectFields(dec) + default: + return nil, nil, fmt.Errorf("unexpected JSON delimiter %q", t) + } + case nil: // JSON null + return []string{""}, nil, nil + case bool: + if t { + return []string{"1"}, nil, nil + } + return []string{"0"}, nil, nil + case json.Number: + return []string{t.String()}, nil, nil + case string: + return []string{t}, nil, nil + default: + return nil, nil, fmt.Errorf("unexpected JSON token type %T", token) + } +} + +func parseJSONArrayFields(dec *json.Decoder) (fields []string, err error) { + for dec.More() { + var raw json.RawMessage + if err := dec.Decode(&raw); err != nil { + return nil, err + } + fields = append(fields, jsonRawToString(raw)) + } + // consume the closing ']' + if _, 
err := dec.Token(); err != nil { + return nil, err + } + return fields, nil +} + +func parseJSONObjectFields(dec *json.Decoder) (fields []string, names []string, err error) { + for dec.More() { + // Read the object key + keyToken, err := dec.Token() + if err != nil { + return nil, nil, err + } + key, ok := keyToken.(string) + if !ok { + return nil, nil, fmt.Errorf("expected string key in JSON object, got %T", keyToken) + } + + // Read the value as raw JSON to preserve document order and type + var raw json.RawMessage + if err := dec.Decode(&raw); err != nil { + return nil, nil, err + } + + fields = append(fields, jsonRawToString(raw)) + names = append(names, key) + } + // consume the closing '}' + if _, err := dec.Token(); err != nil { + return nil, nil, err + } + return fields, names, nil +} + +// jsonRawToValue converts a raw JSON value to an AWK value: +// - null → numStr("") +// - true → numStr("1") +// - false → numStr("0") +// - number → numStr() +// - string → numStr() +// - array/obj → numStr() +func jsonRawToValue(raw json.RawMessage) value { + raw = bytes.TrimSpace(raw) + if len(raw) == 0 { + return numStr("") + } + switch raw[0] { + case 'n': // null + return numStr("") + case 't': // true + return numStr("1") + case 'f': // false + return numStr("0") + case '"': // string + var s string + if err := json.Unmarshal(raw, &s); err == nil { + return numStr(s) + } + return numStr("") + case '[', '{': // array or object – return JSON representation + return numStr(string(raw)) + default: // number + var n json.Number + if err := json.Unmarshal(raw, &n); err == nil { + return numStr(n.String()) + } + return numStr("") + } +} + +// jsonRawToString returns the AWK string representation of a raw JSON value. 
+func jsonRawToString(raw json.RawMessage) string { + return jsonRawToValue(raw).s +} From 88d46edd1762e5f3b249c879d767a2f99b66ee31 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 20:21:01 +0000 Subject: [PATCH 3/4] Address code review feedback: improve comments, optimize jsonRawToString, better error message Agent-Logs-Url: https://github.com/benhoyt/goawk/sessions/29bc6e5b-0551-41a0-83cf-9d679344b4d0 Co-authored-by: benhoyt <999033+benhoyt@users.noreply.github.com> --- interp/interp.go | 2 +- interp/jsonl.go | 32 +++++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/interp/interp.go b/interp/interp.go index 50ed53e7..84fd3050 100644 --- a/interp/interp.go +++ b/interp/interp.go @@ -470,7 +470,7 @@ func (p *interp) setExecuteConfig(config *Config) error { } case JSONLMode: if p.csvInputConfig != (CSVInputConfig{}) { - return newError("input mode configuration not valid in jsonl input mode") + return newError("CSV input configuration options are not supported in JSONL mode") } case DefaultMode: if p.csvInputConfig != (CSVInputConfig{}) { diff --git a/interp/jsonl.go b/interp/jsonl.go index 8c93699f..2cf25d8f 100644 --- a/interp/jsonl.go +++ b/interp/jsonl.go @@ -67,7 +67,7 @@ func (s *jsonlSplitter) scan(data []byte, atEOF bool) (advance int, token []byte return advance, line, nil } -// parseJSONLine parses a JSON line and populates p.fields and p.fieldNames. +// parseJSONLine calls parseJSONLineToFields and updates p.fields and p.fieldNames. // Called by ensureFields() when $0 is reassigned in JSONL mode. func (p *interp) parseJSONLine(line string) error { fields, names, err := parseJSONLineToFields([]byte(line)) @@ -200,7 +200,33 @@ func jsonRawToValue(raw json.RawMessage) value { } } -// jsonRawToString returns the AWK string representation of a raw JSON value. 
+// jsonRawToString returns the AWK string representation of a raw JSON value, +// without allocating an intermediate value struct. func jsonRawToString(raw json.RawMessage) string { - return jsonRawToValue(raw).s + raw = bytes.TrimSpace(raw) + if len(raw) == 0 { + return "" + } + switch raw[0] { + case 'n': // null + return "" + case 't': // true + return "1" + case 'f': // false + return "0" + case '"': // string + var s string + if err := json.Unmarshal(raw, &s); err == nil { + return s + } + return "" + case '[', '{': // array or object – return JSON representation + return string(raw) + default: // number + var n json.Number + if err := json.Unmarshal(raw, &n); err == nil { + return n.String() + } + return "" + } } From 9612689d2b26b38d82522538aa1e824512739942 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Apr 2026 21:00:02 +0000 Subject: [PATCH 4/4] Flatten nested JSON objects/arrays using dot notation for @ fields Agent-Logs-Url: https://github.com/benhoyt/goawk/sessions/c8ca21c9-3654-4775-b9ab-54cdd0583883 Co-authored-by: benhoyt <999033+benhoyt@users.noreply.github.com> --- interp/interp_test.go | 27 ++++++++- interp/jsonl.go | 126 ++++++++++++++++++++++++++---------------- 2 files changed, 103 insertions(+), 50 deletions(-) diff --git a/interp/interp_test.go b/interp/interp_test.go index 7cc00f49..b0b5d7d4 100644 --- a/interp/interp_test.go +++ b/interp/interp_test.go @@ -2182,10 +2182,33 @@ var jsonlTests = []csvTest{ `{"name":"Bob","age":42}`, `{"name":"Bob","age":42}` + "\n", "", nil}, - // Nested objects/arrays are returned as JSON strings + // Nested objects/arrays are flattened with dot notation + {`BEGIN { INPUTMODE="jsonl" } { print @"arr.0", @"arr.1", @"arr.2" }`, + `{"arr":[1,2,3]}`, + "1 2 3\n", "", nil}, + {`BEGIN { INPUTMODE="jsonl" } { print @"obj.x" }`, + `{"obj":{"x":1}}`, + "1\n", "", nil}, + + // Flattened keys are not accessible via the unflattened parent name {`BEGIN { 
INPUTMODE="jsonl" } { print @"arr", @"obj" }`, `{"arr":[1,2,3],"obj":{"x":1}}`, - "[1,2,3] {\"x\":1}\n", "", nil}, + " \n", "", nil}, + + // Deeply nested: @"a.b.c" and @"a.b.d.0" + {`BEGIN { INPUTMODE="jsonl" } { print @"a.b.c", @"a.b.d.0", @"a.b.d.1" }`, + `{"a":{"b":{"c":"hello","d":[10,20]}}}`, + "hello 10 20\n", "", nil}, + + // Flattened NF counts all scalar leaves + {`BEGIN { INPUTMODE="jsonl" } { print NF }`, + `{"a":1,"b":{"c":2,"d":3},"e":[4,5]}`, + "5\n", "", nil}, + + // FIELDS array contains flattened key paths + {`BEGIN { INPUTMODE="jsonl" } { for (i=1; i<=NF; i++) printf "%s=%s\n", FIELDS[i], $i }`, + `{"x":1,"y":{"z":2}}`, + "x=1\ny.z=2\n", "", nil}, // JSON objects: different keys per line (each line independent) {`BEGIN { INPUTMODE="jsonl" } { print @"a", @"b", @"c" }`, diff --git a/interp/jsonl.go b/interp/jsonl.go index 2cf25d8f..8a517601 100644 --- a/interp/jsonl.go +++ b/interp/jsonl.go @@ -6,6 +6,7 @@ import ( "bytes" "encoding/json" "fmt" + "strconv" ) // jsonlSplitter is a bufio.Scanner split function for JSON Lines input. @@ -80,7 +81,10 @@ func (p *interp) parseJSONLine(line string) error { } // parseJSONLineToFields parses a JSON line and returns the field values and -// (for objects) the field names. For arrays, names is nil. +// (for objects) the field names. For JSON objects, nested structures are +// flattened using dot notation: object keys use @"parent.child" and array +// indexes use @"parent.0", @"parent.1", etc. +// For top-level JSON arrays, names is nil and elements map to $1, $2, etc. 
func parseJSONLineToFields(line []byte) (fields []string, names []string, err error) { if len(bytes.TrimSpace(line)) == 0 { return nil, nil, nil @@ -101,7 +105,10 @@ func parseJSONLineToFields(line []byte) (fields []string, names []string, err er fields, err = parseJSONArrayFields(dec) return fields, nil, err case '{': - return parseJSONObjectFields(dec) + if err := flattenObject(dec, "", &fields, &names); err != nil { + return nil, nil, err + } + return fields, names, nil default: return nil, nil, fmt.Errorf("unexpected JSON delimiter %q", t) } @@ -121,6 +128,9 @@ func parseJSONLineToFields(line []byte) (fields []string, names []string, err er } } +// parseJSONArrayFields reads JSON array elements ('[' already consumed) and +// returns them as positional fields. Non-scalar elements are returned as their +// JSON string representation. func parseJSONArrayFields(dec *json.Decoder) (fields []string, err error) { for dec.More() { var raw json.RawMessage @@ -136,72 +146,92 @@ func parseJSONArrayFields(dec *json.Decoder) (fields []string, err error) { return fields, nil } -func parseJSONObjectFields(dec *json.Decoder) (fields []string, names []string, err error) { +// flattenJSONValue recursively flattens a raw JSON value into fields/names +// using dot notation for objects and numeric indexes for arrays. +// path is the dot-separated key path so far (empty at the top level). 
+func flattenJSONValue(raw json.RawMessage, path string, fields *[]string, names *[]string) error { + raw = bytes.TrimSpace(raw) + if len(raw) == 0 { + return nil + } + switch raw[0] { + case '{': + dec := json.NewDecoder(bytes.NewReader(raw)) + dec.UseNumber() + if _, err := dec.Token(); err != nil { // consume '{' + return err + } + return flattenObject(dec, path, fields, names) + case '[': + dec := json.NewDecoder(bytes.NewReader(raw)) + dec.UseNumber() + if _, err := dec.Token(); err != nil { // consume '[' + return err + } + return flattenArray(dec, path, fields, names) + default: + // Scalar value: add to fields with its path as the name. + *fields = append(*fields, jsonRawToString(raw)) + *names = append(*names, path) + return nil + } +} + +// flattenObject processes a JSON object ('{' already consumed) and flattens +// its key-value pairs into fields/names using dot-notation paths. +func flattenObject(dec *json.Decoder, prefix string, fields *[]string, names *[]string) error { for dec.More() { - // Read the object key keyToken, err := dec.Token() if err != nil { - return nil, nil, err + return err } key, ok := keyToken.(string) if !ok { - return nil, nil, fmt.Errorf("expected string key in JSON object, got %T", keyToken) + return fmt.Errorf("expected string key in JSON object, got %T", keyToken) + } + + var path string + if prefix == "" { + path = key + } else { + path = prefix + "." 
+ key } - // Read the value as raw JSON to preserve document order and type var raw json.RawMessage if err := dec.Decode(&raw); err != nil { - return nil, nil, err + return err } - fields = append(fields, jsonRawToString(raw)) - names = append(names, key) - } - // consume the closing '}' - if _, err := dec.Token(); err != nil { - return nil, nil, err + if err := flattenJSONValue(raw, path, fields, names); err != nil { + return err + } } - return fields, names, nil + _, err := dec.Token() // consume '}' + return err } -// jsonRawToValue converts a raw JSON value to an AWK value: -// - null → numStr("") -// - true → numStr("1") -// - false → numStr("0") -// - number → numStr() -// - string → numStr() -// - array/obj → numStr() -func jsonRawToValue(raw json.RawMessage) value { - raw = bytes.TrimSpace(raw) - if len(raw) == 0 { - return numStr("") - } - switch raw[0] { - case 'n': // null - return numStr("") - case 't': // true - return numStr("1") - case 'f': // false - return numStr("0") - case '"': // string - var s string - if err := json.Unmarshal(raw, &s); err == nil { - return numStr(s) +// flattenArray processes a JSON array ('[' already consumed) and flattens +// its elements into fields/names using numeric-index paths (prefix.0, prefix.1, ...). +func flattenArray(dec *json.Decoder, prefix string, fields *[]string, names *[]string) error { + i := 0 + for dec.More() { + var raw json.RawMessage + if err := dec.Decode(&raw); err != nil { + return err } - return numStr("") - case '[', '{': // array or object – return JSON representation - return numStr(string(raw)) - default: // number - var n json.Number - if err := json.Unmarshal(raw, &n); err == nil { - return numStr(n.String()) + path := prefix + "." 
+ strconv.Itoa(i) + if err := flattenJSONValue(raw, path, fields, names); err != nil { + return err } - return numStr("") + i++ } + _, err := dec.Token() // consume ']' + return err } -// jsonRawToString returns the AWK string representation of a raw JSON value, -// without allocating an intermediate value struct. +// jsonRawToString returns the AWK string representation of a scalar JSON value. +// For non-scalar values (arrays and objects), the raw JSON is returned as-is +// (used when a top-level JSON array contains nested structures). func jsonRawToString(raw json.RawMessage) string { raw = bytes.TrimSpace(raw) if len(raw) == 0 {