diff --git a/goawk.go b/goawk.go index 3535692..7deae35 100644 --- a/goawk.go +++ b/goawk.go @@ -61,8 +61,10 @@ Additional GoAWK features: -E progfile load program, treat as last option, disable var=value args -H parse header row and enable @"field" in CSV input mode -h, --help show this help message - -i mode parse input into fields using CSV format (ignore FS and RS) + -i mode parse input into fields using CSV or JSON Lines format + (ignore FS and RS) 'csv|tsv [separator=] [comment=] [header]' + 'jsonl' (JSON Lines: arrays map to $1/$2/..., objects to @"name") -o mode use CSV output for print with args (ignore OFS and ORS) 'csv|tsv [separator=]' -N mode newline output translation: smart (default), raw, crlf diff --git a/interp/interp.go b/interp/interp.go index 121ff9b..84fd305 100644 --- a/interp/interp.go +++ b/interp/interp.go @@ -276,8 +276,13 @@ type Config struct { // "encoding/csv" package, but FieldsPerRecord is not supported, // LazyQuotes is always on, and TrimLeadingSpace is always off. // - // You can also enable CSV or TSV input mode by setting INPUTMODE to "csv" - // or "tsv" in Vars or in the BEGIN block (those override this setting). + // If set to JSONLMode, each input line is parsed as a JSON value. JSON + // arrays map elements to $1, $2, etc. JSON objects map keys to named + // fields accessible via @"name" as well as $1, $2, etc. + // + // You can also enable CSV, TSV, or JSONL input mode by setting INPUTMODE + // to "csv", "tsv", or "jsonl" in Vars or in the BEGIN block (those + // override this setting). // // For further documentation about GoAWK's CSV support, see the full docs // in "../docs/csv.md". @@ -336,6 +341,12 @@ const ( // TSVMode uses tab-separated value mode for input or output. TSVMode IOMode = 2 + + // JSONLMode uses JSON Lines format for input. Each line must be a JSON + // value (typically an array or object). JSON arrays map elements to $1, + // $2, etc. 
JSON objects map keys to named fields accessible via @"name" + // as well as $1, $2, etc. (in document key order). + JSONLMode IOMode = 3 ) // CSVInputConfig holds additional configuration for when InputMode is CSVMode @@ -457,6 +468,10 @@ func (p *interp) setExecuteConfig(config *Config) error { if p.csvInputConfig.Separator == 0 { p.csvInputConfig.Separator = '\t' } + case JSONLMode: + if p.csvInputConfig != (CSVInputConfig{}) { + return newError("CSV input configuration options are not supported in JSONL mode") + } case DefaultMode: if p.csvInputConfig != (CSVInputConfig{}) { return newError("input mode configuration not valid in default input mode") @@ -957,11 +972,19 @@ func (p *interp) getField(index int) value { } } -// Get the value of a field by name (for CSV/TSV mode), as in @"name". +// Get the value of a field by name (for CSV/TSV/JSONL mode), as in @"name". func (p *interp) getFieldByName(name string) (value, error) { + if p.inputMode == JSONLMode { + // In JSONL mode, we must ensure fields are parsed (per-record field + // names come from each JSON object, not a fixed header). + p.ensureFields() + } if p.fieldIndexes == nil { // Lazily create map of field names to indexes. 
if p.fieldNames == nil { + if p.inputMode == JSONLMode { + return null(), newError(`no field names for @"name" in JSONL mode; current record is not a JSON object`) + } return null(), newError(`no field names for @; use -H or add "header" to INPUTMODE, and use "getline" first if in BEGIN`) } p.fieldIndexes = make(map[string]int, len(p.fieldNames)) @@ -1058,6 +1081,8 @@ func inputModeString(mode IOMode, csvConfig CSVInputConfig) string { case TSVMode: s = "tsv" defaultSep = '\t' + case JSONLMode: + return "jsonl" case DefaultMode: return "" } @@ -1085,6 +1110,12 @@ func parseInputMode(s string) (mode IOMode, csvConfig CSVInputConfig, err error) case "tsv": mode = TSVMode csvConfig.Separator = '\t' + case "jsonl": + mode = JSONLMode + if len(fields) > 1 { + return DefaultMode, CSVInputConfig{}, newError("jsonl input mode takes no options") + } + return mode, CSVInputConfig{}, nil default: return DefaultMode, CSVInputConfig{}, newError("invalid input mode %q", fields[0]) } diff --git a/interp/interp_test.go b/interp/interp_test.go index a511525..b0b5d7d 100644 --- a/interp/interp_test.go +++ b/interp/interp_test.go @@ -2135,6 +2135,163 @@ BEGIN { {`BEGIN { x="a"; @x += "y" }`, "", "", "parse error at 1:19: assigning @ expression not supported", nil}, } +// JSON Lines (JSONL) test cases +var jsonlTests = []csvTest{ + // JSON arrays: elements map to $1, $2, etc. 
+ {`BEGIN { INPUTMODE="jsonl" } { print $1, $2, $3 }`, + `["Bob", "Smith", 42]` + "\n" + `["Jane", "Brown", 37]`, + "Bob Smith 42\nJane Brown 37\n", "", nil}, + + // JSON boolean and null conversions + {`BEGIN { INPUTMODE="jsonl" } { print $1, $2, $3, $4 }`, + `[true, false, null, 3.14]`, + "1 0 3.14\n", "", nil}, + + // JSON objects: @"name" syntax + {`BEGIN { INPUTMODE="jsonl" } { print @"name", @"age" }`, + `{"name":"Bob","age":42}` + "\n" + `{"name":"Jane","age":37}`, + "Bob 42\nJane 37\n", "", nil}, + + // JSON objects: dynamic @x lookup + {`BEGIN { INPUTMODE="jsonl" } { x="name"; print @x, @"age" }`, + `{"name":"Alice","age":25}`, + "Alice 25\n", "", nil}, + + // JSON objects: missing key returns empty string + {`BEGIN { INPUTMODE="jsonl" } { print @"name", @"missing" }`, + `{"name":"Bob","age":42}`, + "Bob \n", "", nil}, + + // JSON objects: $1, $2, ... also work (in document order) + {`BEGIN { INPUTMODE="jsonl" } { print $1, $2 }`, + `{"name":"Bob","age":42}`, + "Bob 42\n", "", nil}, + + // JSON objects: FIELDS array is updated per record + {`BEGIN { INPUTMODE="jsonl" } { print FIELDS[1], FIELDS[2] }`, + `{"name":"Bob","age":42}` + "\n" + `{"city":"NY","zip":"10001"}`, + "name age\ncity zip\n", "", nil}, + + // JSON objects: NF works + {`BEGIN { INPUTMODE="jsonl" } { print NF }`, + `{"a":1,"b":2,"c":3}`, + "3\n", "", nil}, + + // JSON objects: $0 is the raw JSON line + {`BEGIN { INPUTMODE="jsonl" } { print $0 }`, + `{"name":"Bob","age":42}`, + `{"name":"Bob","age":42}` + "\n", "", nil}, + + // Nested objects/arrays are flattened with dot notation + {`BEGIN { INPUTMODE="jsonl" } { print @"arr.0", @"arr.1", @"arr.2" }`, + `{"arr":[1,2,3]}`, + "1 2 3\n", "", nil}, + {`BEGIN { INPUTMODE="jsonl" } { print @"obj.x" }`, + `{"obj":{"x":1}}`, + "1\n", "", nil}, + + // Flattened keys are not accessible via the unflattened parent name + {`BEGIN { INPUTMODE="jsonl" } { print @"arr", @"obj" }`, + `{"arr":[1,2,3],"obj":{"x":1}}`, + " \n", "", nil}, + + // Deeply nested: 
@"a.b.c" and @"a.b.d.0" + {`BEGIN { INPUTMODE="jsonl" } { print @"a.b.c", @"a.b.d.0", @"a.b.d.1" }`, + `{"a":{"b":{"c":"hello","d":[10,20]}}}`, + "hello 10 20\n", "", nil}, + + // Flattened NF counts all scalar leaves + {`BEGIN { INPUTMODE="jsonl" } { print NF }`, + `{"a":1,"b":{"c":2,"d":3},"e":[4,5]}`, + "5\n", "", nil}, + + // FIELDS array contains flattened key paths + {`BEGIN { INPUTMODE="jsonl" } { for (i=1; i<=NF; i++) printf "%s=%s\n", FIELDS[i], $i }`, + `{"x":1,"y":{"z":2}}`, + "x=1\ny.z=2\n", "", nil}, + + // JSON objects: different keys per line (each line independent) + {`BEGIN { INPUTMODE="jsonl" } { print @"a", @"b", @"c" }`, + `{"a":"x","b":"y"}` + "\n" + `{"b":"p","c":"q"}`, + "x y \n p q\n", "", nil}, + + // INPUTMODE "jsonl" round-trips via INPUTMODE variable + {`BEGIN { INPUTMODE="jsonl"; print INPUTMODE }`, "", "jsonl\n", "", nil}, + + // NR and FNR work correctly in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } { print NR, $1 }`, + `["a"]` + "\n" + `["b"]` + "\n" + `["c"]`, + "1 a\n2 b\n3 c\n", "", nil}, + + // Filtering works in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } @"type"=="error" { print @"msg" }`, + `{"type":"info","msg":"ok"}` + "\n" + `{"type":"error","msg":"fail"}`, + "fail\n", "", nil}, + + // @"name" on a JSON array line returns an error + {`BEGIN { INPUTMODE="jsonl" } { print @"x" }`, + `["a","b"]`, + "", `no field names for @"name" in JSONL mode; current record is not a JSON object`, nil}, + + // Configure via interp.Config struct + {`{ print $1, $2 }`, `["hello","world"]`, "hello world\n", "", func(config *interp.Config) { + config.InputMode = interp.JSONLMode + }}, + {`{ print @"k", @"v" }`, `{"k":"key","v":"val"}`, "key val\n", "", func(config *interp.Config) { + config.InputMode = interp.JSONLMode + }}, + + // $0 reassignment in JSONL mode re-parses as JSON + {`BEGIN { INPUTMODE="jsonl" } { $0 = "{\"x\":99}"; print @"x" }`, + `{"x":1}`, + "99\n", "", nil}, + + // NF works correctly for JSON arrays + {`BEGIN { 
INPUTMODE="jsonl" } { print NF }`, + `["a","b","c","d"]`, + "4\n", "", nil}, + + // Empty lines are skipped in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } { print NR, $1 }`, + "\n" + `["a"]` + "\n\n" + `["b"]` + "\n", + "1 a\n2 b\n", "", nil}, + + // Unicode strings work in JSONL mode + {`BEGIN { INPUTMODE="jsonl" } { print @"name" }`, + `{"name":"日本語"}`, + "日本語\n", "", nil}, + + // JSON strings with escape sequences are unescaped + {`BEGIN { INPUTMODE="jsonl" } { print @"s" }`, + `{"s":"hello\nworld"}`, + "hello\nworld\n", "", nil}, + + // Arithmetic on JSON numbers + {`BEGIN { INPUTMODE="jsonl" } { print @"a" + @"b" }`, + `{"a":10,"b":32}`, + "42\n", "", nil}, + + // Boolean comparison on true/false values + {`BEGIN { INPUTMODE="jsonl" } { print ($1 == 1), ($2 == 0) }`, + `[true, false]`, + "1 1\n", "", nil}, + + // INPUTMODE "jsonl" option parsing error + {`BEGIN { INPUTMODE="jsonl foo" }`, "", "", `jsonl input mode takes no options`, nil}, +} + +func TestJSONL(t *testing.T) { + for _, test := range jsonlTests { + testName := test.src + if len(testName) > 70 { + testName = testName[:70] + } + t.Run(testName, func(t *testing.T) { + testGoAWK(t, test.src, test.in, test.out, test.err, nil, test.configure) + }) + } +} + func TestCSV(t *testing.T) { for _, test := range csvTests { testName := test.src diff --git a/interp/io.go b/interp/io.go index 72e9af5..64dc9b2 100644 --- a/interp/io.go +++ b/interp/io.go @@ -255,6 +255,13 @@ func (p *interp) newScanner(input io.Reader, buffer []byte) *bufio.Scanner { setFieldNames: p.setFieldNames, } scanner.Split(splitter.scan) + case p.inputMode == JSONLMode: + splitter := &jsonlSplitter{ + fields: &p.fields, + setFieldNames: p.setFieldNames, + interp: p, + } + scanner.Split(splitter.scan) case p.recordSep == "\n": // Scanner default is to split on newlines case p.recordSep == "": @@ -274,7 +281,8 @@ func (p *interp) newScanner(input io.Reader, buffer []byte) *bufio.Scanner { } // setFieldNames is called by csvSplitter.scan on 
the first row (if the -// "header" option is specified). +// "header" option is specified), and by parseJSONLine for each JSON object +// record. If names is nil, field names are cleared. func (p *interp) setFieldNames(names []string) { p.fieldNames = names p.fieldIndexes = nil // clear name-to-index cache @@ -685,6 +693,16 @@ func (p *interp) ensureFields() { p.fields = nil } } + case p.inputMode == JSONLMode: + // Normally fields have already been parsed by jsonlSplitter. + // Only re-parse if $0 was explicitly assigned (reparseCSV flag). + if p.reparseCSV { + if err := p.parseJSONLine(p.line); err != nil { + fmt.Fprintf(p.errorOutput, "goawk: %s\n", err) + p.fields = nil + p.setFieldNames(nil) + } + } case p.savedFieldSep == " ": // FS space (default) means split fields on any whitespace p.fields = strings.Fields(p.line) diff --git a/interp/jsonl.go b/interp/jsonl.go new file mode 100644 index 0000000..8a51760 --- /dev/null +++ b/interp/jsonl.go @@ -0,0 +1,262 @@ +// JSON Lines input parsing for GoAWK interpreter. + +package interp + +import ( + "bytes" + "encoding/json" + "fmt" + "strconv" +) + +// jsonlSplitter is a bufio.Scanner split function for JSON Lines input. +// It splits on newlines, skipping empty lines, and pre-parses each JSON line +// into fields (like csvSplitter does for CSV). This ensures that FIELDS and +// other per-record state are populated before each action body runs. +type jsonlSplitter struct { + fields *[]string + setFieldNames func(names []string) + interp *interp // for error reporting +} + +func (s *jsonlSplitter) scan(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + // Find and skip empty lines; stop at first non-empty line. 
+ skip := 0 + var line []byte + for { + newline := bytes.IndexByte(data, '\n') + var lineEnd int + if newline >= 0 { + lineEnd = newline + 1 + } else if atEOF { + lineEnd = len(data) + } else { + return 0, nil, nil // need more data + } + + candidate := dropCR(data[:lineEnd-lenNewline(data[:lineEnd])]) + if len(bytes.TrimSpace(candidate)) > 0 { + line = candidate + advance += lineEnd + break + } + // Empty line: skip it + advance += lineEnd + skip += lineEnd + data = data[lineEnd:] + if atEOF && len(data) == 0 { + return advance, nil, nil + } + } + + // Parse the JSON line and populate fields / field names. + fields, names, parseErr := parseJSONLineToFields(line) + if parseErr != nil { + if s.interp != nil { + fmt.Fprintf(s.interp.errorOutput, "goawk: %s\n", parseErr) + } + fields = nil + names = nil + } + *s.fields = fields + s.setFieldNames(names) + + return advance, line, nil +} + +// parseJSONLine calls parseJSONLineToFields and updates p.fields and p.fieldNames. +// Called by ensureFields() when $0 is reassigned in JSONL mode. +func (p *interp) parseJSONLine(line string) error { + fields, names, err := parseJSONLineToFields([]byte(line)) + if err != nil { + return err + } + p.fields = fields + p.setFieldNames(names) + return nil +} + +// parseJSONLineToFields parses a JSON line and returns the field values and +// (for objects) the field names. For JSON objects, nested structures are +// flattened using dot notation: object keys use @"parent.child" and array +// indexes use @"parent.0", @"parent.1", etc. +// For top-level JSON arrays, names is nil and elements map to $1, $2, etc. 
func parseJSONLineToFields(line []byte) (fields []string, names []string, err error) {
	// Contract:
	//   - A blank line yields no fields, no names and no error.
	//   - A top-level array yields positional fields only (names is nil).
	//   - A top-level object yields flattened fields plus a non-nil names
	//     slice, even for "{}", so callers can tell "object with no keys"
	//     apart from "record is not an object".
	//   - A top-level scalar yields a single field and nil names.
	//   - Any malformed input, including extra data after the first
	//     top-level value, is reported as an "invalid JSON: ..." error and
	//     no partial results are returned.
	if len(bytes.TrimSpace(line)) == 0 {
		return nil, nil, nil
	}

	dec := json.NewDecoder(bytes.NewReader(line))
	dec.UseNumber() // keep numbers as their source text instead of float64

	token, err := dec.Token()
	if err != nil {
		return nil, nil, fmt.Errorf("invalid JSON: %w", err)
	}

	switch t := token.(type) {
	case json.Delim:
		switch t {
		case '[':
			fields, err = parseJSONArrayFields(dec)
		case '{':
			// Non-nil even when the object has no keys.
			names = []string{}
			err = flattenObject(dec, "", &fields, &names)
		default:
			return nil, nil, fmt.Errorf("unexpected JSON delimiter %q", t)
		}
		if err != nil {
			return nil, nil, fmt.Errorf("invalid JSON: %w", err)
		}
	case nil: // JSON null
		fields = []string{""}
	case bool:
		if t {
			fields = []string{"1"}
		} else {
			fields = []string{"0"}
		}
	case json.Number:
		fields = []string{t.String()}
	case string:
		fields = []string{t}
	default:
		return nil, nil, fmt.Errorf("unexpected JSON token type %T", token)
	}

	// The decoder stops after the first complete value, so something like
	// `{"a":1} junk` would otherwise be accepted. A JSON Lines record must be
	// exactly one JSON value, which is what json.Valid checks.
	if !json.Valid(line) {
		return nil, nil, fmt.Errorf("invalid JSON: unexpected data after top-level value")
	}
	return fields, names, nil
}

// parseJSONArrayFields reads the elements of a top-level JSON array whose
// opening '[' has already been consumed from dec, and returns them as
// positional fields. Scalars are converted with jsonRawToString; nested arrays
// and objects are kept as their raw JSON text. The closing ']' is consumed.
func parseJSONArrayFields(dec *json.Decoder) (fields []string, err error) {
	for dec.More() {
		var raw json.RawMessage
		if err := dec.Decode(&raw); err != nil {
			return nil, err
		}
		fields = append(fields, jsonRawToString(raw))
	}
	// Consume the closing ']'.
	if _, err := dec.Token(); err != nil {
		return nil, err
	}
	return fields, nil
}

// flattenJSONValue appends the scalar leaves of a raw JSON value to
// fields/names. Objects and arrays are descended into recursively, extending
// path with ".key" or ".index"; a scalar is appended as one field whose name
// is path. An empty raw value appends nothing.
func flattenJSONValue(raw json.RawMessage, path string, fields *[]string, names *[]string) error {
	raw = bytes.TrimSpace(raw)
	if len(raw) == 0 {
		return nil
	}

	if raw[0] != '{' && raw[0] != '[' {
		// Scalar leaf: one field, named by the path that led to it.
		*fields = append(*fields, jsonRawToString(raw))
		*names = append(*names, path)
		return nil
	}

	// Container: re-tokenize the raw bytes and skip the opening delimiter,
	// since flattenObject/flattenArray expect it to be consumed already.
	dec := json.NewDecoder(bytes.NewReader(raw))
	dec.UseNumber()
	if _, err := dec.Token(); err != nil {
		return err
	}
	if raw[0] == '{' {
		return flattenObject(dec, path, fields, names)
	}
	return flattenArray(dec, path, fields, names)
}

// flattenObject reads the members of a JSON object whose opening '{' has
// already been consumed from dec. Each member is flattened under the path
// "prefix.key" (or just "key" when prefix is empty), in document order. The
// closing '}' is consumed.
func flattenObject(dec *json.Decoder, prefix string, fields *[]string, names *[]string) error {
	for dec.More() {
		keyToken, err := dec.Token()
		if err != nil {
			return err
		}
		key, ok := keyToken.(string)
		if !ok {
			return fmt.Errorf("expected string key in JSON object, got %T", keyToken)
		}

		path := key
		if prefix != "" {
			path = prefix + "." + key
		}

		var raw json.RawMessage
		if err := dec.Decode(&raw); err != nil {
			return err
		}
		if err := flattenJSONValue(raw, path, fields, names); err != nil {
			return err
		}
	}
	_, err := dec.Token() // consume '}'
	return err
}

// flattenArray reads the elements of a JSON array whose opening '[' has
// already been consumed from dec. Element i is flattened under the path
// "prefix.i" (zero-based). The closing ']' is consumed.
func flattenArray(dec *json.Decoder, prefix string, fields *[]string, names *[]string) error {
	for i := 0; dec.More(); i++ {
		var raw json.RawMessage
		if err := dec.Decode(&raw); err != nil {
			return err
		}
		path := prefix + "." + strconv.Itoa(i)
		if err := flattenJSONValue(raw, path, fields, names); err != nil {
			return err
		}
	}
	_, err := dec.Token() // consume ']'
	return err
}

// jsonRawToString returns the AWK string form of a raw JSON value: null maps
// to "", true to "1", false to "0", strings are unescaped, and numbers keep
// their literal text. Arrays and objects are returned as their raw JSON text
// (this is what a nested structure inside a top-level array becomes). A
// string or number that fails to decode yields "".
func jsonRawToString(raw json.RawMessage) string {
	raw = bytes.TrimSpace(raw)
	if len(raw) == 0 {
		return ""
	}
	switch raw[0] {
	case 'n': // null
		return ""
	case 't': // true
		return "1"
	case 'f': // false
		return "0"
	case '"': // string
		var s string
		if err := json.Unmarshal(raw, &s); err == nil {
			return s
		}
		return ""
	case '[', '{': // array or object: keep the JSON representation
		return string(raw)
	default: // number
		var n json.Number
		if err := json.Unmarshal(raw, &n); err == nil {
			return n.String()
		}
		return ""
	}
}