From e3e2581f924ecdc80fefc0521b9d9d3edb01b568 Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Tue, 3 Jan 2023 13:16:29 +0100 Subject: [PATCH] cvs: Add more from/to options Add header option, on by default Disable comments by default Rename comma to delimiter Add quote_char option Add skip initial space option Uses a forked version of std csv to support custom quote character See top of csv.go for TODOs --- format/csv/csv.go | 178 +++++++-- format/format.go | 8 +- internal/csvex/README.md | 1 + internal/csvex/reader.go | 474 ++++++++++++++++++++++++ internal/csvex/reader_test.go | 669 ++++++++++++++++++++++++++++++++++ internal/csvex/write_test.go | 124 +++++++ internal/csvex/writer.go | 186 ++++++++++ 7 files changed, 1608 insertions(+), 32 deletions(-) create mode 100644 internal/csvex/README.md create mode 100644 internal/csvex/reader.go create mode 100644 internal/csvex/reader_test.go create mode 100644 internal/csvex/write_test.go create mode 100644 internal/csvex/writer.go diff --git a/format/csv/csv.go b/format/csv/csv.go index b86ca7485..fab57b675 100644 --- a/format/csv/csv.go +++ b/format/csv/csv.go @@ -1,14 +1,26 @@ package csv +// TODO: error, throw error always? no decode value with gap etc? -d csv from_csv +// TODO: header row field count mismatch error, csv reader takes care of check atm. can use FieldsPerRecord -1 +// TODO: row object keys mismatch writer +// TODO: lazy quotes? +// TODO: comment in writer? string elements? +// TODO: to_csv objects +// TODO: to_csv opts help +// TODO: go maps are random order, now sorts headers +// TODO: option aliases? +// TODO: snake_case option? + import ( "bytes" "embed" - "encoding/csv" "errors" "fmt" "io" + "sort" "github.com/wader/fq/format" + "github.com/wader/fq/internal/csvex" "github.com/wader/fq/internal/gojqex" "github.com/wader/fq/pkg/bitio" "github.com/wader/fq/pkg/decode" @@ -27,8 +39,11 @@ func init() { ProbeOrder: format.ProbeOrderTextFuzzy, DecodeFn: decodeCSV, DecodeInArg: format.CSVLIn{ - Comma: ",", - Comment: "#", + Delimiter: ",", + Comment: "", + QuoteChar: `"`, + Header: true, + SkipInitialSpace: false, }, Functions: []string{"_todisplay"}, }) @@ -39,17 +54,30 @@ func init() { func decodeCSV(d *decode.D, in any) any { ci, _ := in.(format.CSVLIn) - var rvs []any br := d.RawLen(d.Len()) - r := csv.NewReader(bitio.NewIOReader(br)) - r.TrimLeadingSpace = true + r := csvex.NewReader(bitio.NewIOReader(br)) r.LazyQuotes = true - if ci.Comma != "" { + if ci.Delimiter != "" { + r.Comma = rune(ci.Delimiter[0]) + } else if ci.Comma != "" { r.Comma = rune(ci.Comma[0]) } if ci.Comment != "" { r.Comment = rune(ci.Comment[0]) + } else { + r.Comment = 0 + } + if ci.QuoteChar != "" { + r.Quote = rune(ci.QuoteChar[0]) + } else { + r.Quote = '"' } + r.TrimLeadingSpace = ci.SkipInitialSpace + + row := 1 + var rvs []any + + var headers []string for { r, err := r.Read() if errors.Is(err, io.EOF) { @@ -57,11 +85,28 @@ func decodeCSV(d *decode.D, in any) any { } else if err != nil { return err } - var vs []any - for _, s := range r { - vs = append(vs, s) + + if ci.Header { + if headers == nil { + // TODO: duplicate headers? + headers = append(headers, r...) + } else { + obj := map[string]any{} + for i, s := range r { + h := headers[i] + obj[h] = s + } + rvs = append(rvs, obj) + } + } else { + var vs []any + for _, s := range r { + vs = append(vs, s) + } + rvs = append(rvs, vs) } - rvs = append(rvs, vs) + + row++ } d.Value.V = &scalar.Any{Actual: rvs} @@ -71,35 +116,108 @@ func decodeCSV(d *decode.D, in any) any { } type ToCSVOpts struct { - Comma string + Comma string // alias for Delimiter + Delimiter string + QuoteChar string + Header bool } func toCSV(_ *interp.Interp, c []any, opts ToCSVOpts) any { b := &bytes.Buffer{} - w := csv.NewWriter(b) - if opts.Comma != "" { + w := csvex.NewWriter(b) + if opts.Delimiter != "" { + w.Comma = rune(opts.Delimiter[0]) + } else if opts.Comma != "" { w.Comma = rune(opts.Comma[0]) } + if opts.QuoteChar != "" { + w.Quote = rune(opts.QuoteChar[0]) + } else { + w.Quote = '"' + } + + seenObject := 0 + seenArrays := 0 + var headers []string + for _, row := range c { - rs, ok := gojqex.Cast[[]any](row) - if !ok { - return fmt.Errorf("expected row to be an array, got %s", gojqex.TypeErrorPreview(row)) - } - vs, ok := gojqex.NormalizeToStrings(rs).([]any) - if !ok { - panic("not array") - } - var ss []string - for _, v := range vs { - s, ok := v.(string) + switch row.(type) { + case []any: + if seenObject > 0 { + return fmt.Errorf("mixed row types, expected row to be an object, got %s", gojqex.TypeErrorPreview(row)) + } + + rs, ok := gojqex.Cast[[]any](row) if !ok { - return fmt.Errorf("expected row record to be scalars, got %s", gojqex.TypeErrorPreview(v)) + return fmt.Errorf("expected row to be an array, got %s", gojqex.TypeErrorPreview(row)) } - ss = append(ss, s) - } - if err := w.Write(ss); err != nil { - return err + vs, ok := gojqex.NormalizeToStrings(rs).([]any) + if !ok { + panic("not array") + } + var ss []string + for _, v := range vs { + s, ok := v.(string) + if !ok { + return fmt.Errorf("expected row record to be scalars, got %s", gojqex.TypeErrorPreview(v)) + } + ss = append(ss, s) + } + if err := w.Write(ss); err != nil { + return err + } + + seenArrays++ + case map[string]any: + if seenArrays > 0 { + return fmt.Errorf("mixed row types, expected row to be an array, got %s", gojqex.TypeErrorPreview(row)) + } + + rm, ok := gojqex.Cast[map[string]any](row) + if !ok { + return fmt.Errorf("expected row to be an object, got %s", gojqex.TypeErrorPreview(row)) + } + vm, ok := gojqex.NormalizeToStrings(rm).(map[string]any) + if !ok { + panic("not object") + } + + if headers == nil { + // TODO: maps are random order in go + for k := range vm { + headers = append(headers, k) + } + sort.Strings(headers) + + if err := w.Write(headers); err != nil { + return err + } + } + + var ss []string + keysFound := 0 + for _, k := range headers { + s, ok := vm[k].(string) + if !ok { + return fmt.Errorf("expected row object to have a %q key, %s", k, gojqex.TypeErrorPreview(row)) + } + ss = append(ss, s) + keysFound++ + } + // TODO: what keys are extra/missing + if keysFound < len(headers) { + return fmt.Errorf("expected row object has missing keys %s", gojqex.TypeErrorPreview(row)) + } else if keysFound > len(headers) { + return fmt.Errorf("expected row object has extra keys %s", gojqex.TypeErrorPreview(row)) + } + + if err := w.Write(ss); err != nil { + return err + } + + seenObject++ } + } w.Flush() diff --git a/format/format.go b/format/format.go index 219b23b43..968e47f3b 100644 --- a/format/format.go +++ b/format/format.go @@ -320,8 +320,12 @@ type HTMLIn struct { } type CSVLIn struct { - Comma string `doc:"Separator character"` - Comment string `doc:"Comment line character"` + Comma string `doc:"Alias for Delimiter"` + Delimiter string `doc:"Field delimiter character"` + Comment string `doc:"Comment line character"` + QuoteChar string `doc:"Quote character"` + Header bool `doc:"Convert to objects based on header"` + SkipInitialSpace bool `doc:"Don't include leading whitespace"` } type BitCoinBlockIn struct { diff --git a/internal/csvex/README.md b/internal/csvex/README.md new file mode 100644 index 000000000..7904b6ab7 --- /dev/null +++ b/internal/csvex/README.md @@ -0,0 +1 @@ +Modified version of go std encoding/csv Reader/Writer to support difference quote character diff --git a/internal/csvex/reader.go b/internal/csvex/reader.go new file mode 100644 index 000000000..e71fe4cb2 --- /dev/null +++ b/internal/csvex/reader.go @@ -0,0 +1,474 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package csv reads and writes comma-separated values (CSV) files. +// There are many kinds of CSV files; this package supports the format +// described in RFC 4180. +// +// A csv file contains zero or more records of one or more fields per record. +// Each record is separated by the newline character. The final record may +// optionally be followed by a newline character. +// +// field1,field2,field3 +// +// White space is considered part of a field. +// +// Carriage returns before newline characters are silently removed. +// +// Blank lines are ignored. A line with only whitespace characters (excluding +// the ending newline character) is not considered a blank line. +// +// Fields which start and stop with the quote character " are called +// quoted-fields. The beginning and ending quote are not part of the +// field. +// +// The source: +// +// normal string,"quoted-field" +// +// results in the fields +// +// {`normal string`, `quoted-field`} +// +// Within a quoted-field a quote character followed by a second quote +// character is considered a single quote. +// +// "the ""word"" is true","a ""quoted-field""" +// +// results in +// +// {`the "word" is true`, `a "quoted-field"`} +// +// Newlines and commas may be included in a quoted-field +// +// "Multi-line +// field","comma is ," +// +// results in +// +// {`Multi-line +// field`, `comma is ,`} + +// skip to keep code similar upstream +// +//nolint:errorlint +package csvex + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "unicode" + "unicode/utf8" +) + +// A ParseError is returned for parsing errors. +// Line numbers are 1-indexed and columns are 0-indexed. +type ParseError struct { + StartLine int // Line where the record starts + Line int // Line where the error occurred + Column int // Column (1-based byte index) where the error occurred + Err error // The actual error +} + +func (e *ParseError) Error() string { + if e.Err == ErrFieldCount { + return fmt.Sprintf("record on line %d: %v", e.Line, e.Err) + } + if e.StartLine != e.Line { + return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err) + } + return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err) +} + +func (e *ParseError) Unwrap() error { return e.Err } + +// These are the errors that can be returned in ParseError.Err. +var ( + ErrBareQuote = errors.New("bare \" in non-quoted-field") + ErrQuote = errors.New("extraneous or missing \" in quoted-field") + ErrFieldCount = errors.New("wrong number of fields") + + // Deprecated: ErrTrailingComma is no longer used. + ErrTrailingComma = errors.New("extra delimiter at end of line") +) + +var errInvalidDelim = errors.New("csv: invalid field or comment delimiter") + +func validDelim(r rune, q rune) bool { + return r != 0 && r != q && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError +} + +// A Reader reads records from a CSV-encoded file. +// +// As returned by NewReader, a Reader expects input conforming to RFC 4180. +// The exported fields can be changed to customize the details before the +// first call to Read or ReadAll. +// +// The Reader converts all \r\n sequences in its input to plain \n, +// including in multiline field values, so that the returned data does +// not depend on which line-ending convention an input file uses. +type Reader struct { + // Comma is the field delimiter. + // It is set to comma (',') by NewReader. + // Comma must be a valid rune and must not be \r, \n, + // or the Unicode replacement character (0xFFFD). + Comma rune + + // Comment, if not 0, is the comment character. Lines beginning with the + // Comment character without preceding whitespace are ignored. + // With leading whitespace the Comment character becomes part of the + // field, even if TrimLeadingSpace is true. + // Comment must be a valid rune and must not be \r, \n, + // or the Unicode replacement character (0xFFFD). + // It must also not be equal to Comma. + Comment rune + + // FieldsPerRecord is the number of expected fields per record. + // If FieldsPerRecord is positive, Read requires each record to + // have the given number of fields. If FieldsPerRecord is 0, Read sets it to + // the number of fields in the first record, so that future records must + // have the same field count. If FieldsPerRecord is negative, no check is + // made and records may have a variable number of fields. + FieldsPerRecord int + + // If LazyQuotes is true, a quote may appear in an unquoted field and a + // non-doubled quote may appear in a quoted field. + LazyQuotes bool + + // If TrimLeadingSpace is true, leading white space in a field is ignored. + // This is done even if the field delimiter, Comma, is white space. + TrimLeadingSpace bool + + // ReuseRecord controls whether calls to Read may return a slice sharing + // the backing array of the previous call's returned slice for performance. + // By default, each call to Read returns newly allocated memory owned by the caller. + ReuseRecord bool + + // Deprecated: TrailingComma is no longer used. + TrailingComma bool + + // Quote character + Quote rune + + r *bufio.Reader + + // numLine is the current line being read in the CSV file. + numLine int + + // offset is the input stream byte offset of the current reader position. + offset int64 + + // rawBuffer is a line buffer only used by the readLine method. + rawBuffer []byte + + // recordBuffer holds the unescaped fields, one after another. + // The fields can be accessed by using the indexes in fieldIndexes. + // E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de` + // and fieldIndexes will contain the indexes [1, 2, 5, 6]. + recordBuffer []byte + + // fieldIndexes is an index of fields inside recordBuffer. + // The i'th field ends at offset fieldIndexes[i] in recordBuffer. + fieldIndexes []int + + // fieldPositions is an index of field positions for the + // last record returned by Read. + fieldPositions []position + + // lastRecord is a record cache and only used when ReuseRecord == true. + lastRecord []string +} + +// NewReader returns a new Reader that reads from r. +func NewReader(r io.Reader) *Reader { + return &Reader{ + Comma: ',', + Quote: '"', + r: bufio.NewReader(r), + } +} + +// Read reads one record (a slice of fields) from r. +// If the record has an unexpected number of fields, +// Read returns the record along with the error ErrFieldCount. +// Except for that case, Read always returns either a non-nil +// record or a non-nil error, but not both. +// If there is no data left to be read, Read returns nil, io.EOF. +// If ReuseRecord is true, the returned slice may be shared +// between multiple calls to Read. +func (r *Reader) Read() (record []string, err error) { + if r.ReuseRecord { + record, err = r.readRecord(r.lastRecord) + r.lastRecord = record + } else { + record, err = r.readRecord(nil) + } + return record, err +} + +// FieldPos returns the line and column corresponding to +// the start of the field with the given index in the slice most recently +// returned by Read. Numbering of lines and columns starts at 1; +// columns are counted in bytes, not runes. +// +// If this is called with an out-of-bounds index, it panics. +func (r *Reader) FieldPos(field int) (line, column int) { + if field < 0 || field >= len(r.fieldPositions) { + panic("out of range index passed to FieldPos") + } + p := &r.fieldPositions[field] + return p.line, p.col +} + +// InputOffset returns the input stream byte offset of the current reader +// position. The offset gives the location of the end of the most recently +// read row and the beginning of the next row. +func (r *Reader) InputOffset() int64 { + return r.offset +} + +// pos holds the position of a field in the current line. +type position struct { + line, col int +} + +// ReadAll reads all the remaining records from r. +// Each record is a slice of fields. +// A successful call returns err == nil, not err == io.EOF. Because ReadAll is +// defined to read until EOF, it does not treat end of file as an error to be +// reported. +func (r *Reader) ReadAll() (records [][]string, err error) { + for { + record, err := r.readRecord(nil) + if err == io.EOF { + return records, nil + } + if err != nil { + return nil, err + } + records = append(records, record) + } +} + +// readLine reads the next line (with the trailing endline). +// If EOF is hit without a trailing endline, it will be omitted. +// If some bytes were read, then the error is never io.EOF. +// The result is only valid until the next call to readLine. +func (r *Reader) readLine() ([]byte, error) { + line, err := r.r.ReadSlice('\n') + if err == bufio.ErrBufferFull { + r.rawBuffer = append(r.rawBuffer[:0], line...) + for err == bufio.ErrBufferFull { + line, err = r.r.ReadSlice('\n') + r.rawBuffer = append(r.rawBuffer, line...) + } + line = r.rawBuffer + } + readSize := len(line) + if readSize > 0 && err == io.EOF { + err = nil + // For backwards compatibility, drop trailing \r before EOF. + if line[readSize-1] == '\r' { + line = line[:readSize-1] + } + } + r.numLine++ + r.offset += int64(readSize) + // Normalize \r\n to \n on all input lines. + if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' { + line[n-2] = '\n' + line = line[:n-1] + } + return line, err +} + +// lengthNL reports the number of bytes for the trailing \n. +func lengthNL(b []byte) int { + if len(b) > 0 && b[len(b)-1] == '\n' { + return 1 + } + return 0 +} + +// nextRune returns the next rune in b or utf8.RuneError. +func nextRune(b []byte) rune { + r, _ := utf8.DecodeRune(b) + return r +} + +func (r *Reader) readRecord(dst []string) ([]string, error) { + if r.Comma == r.Comment || !validDelim(r.Comma, r.Quote) || (r.Comment != 0 && !validDelim(r.Comment, r.Quote)) { + return nil, errInvalidDelim + } + + // Read line (automatically skipping past empty lines and any comments). + var line []byte + var errRead error + for errRead == nil { + line, errRead = r.readLine() + if r.Comment != 0 && nextRune(line) == r.Comment { + line = nil + continue // Skip comment lines + } + if errRead == nil && len(line) == lengthNL(line) { + line = nil + continue // Skip empty lines + } + break + } + if errRead == io.EOF { + return nil, errRead + } + + // Parse each field in the record. + var err error + quoteLen := utf8.RuneLen(r.Quote) + quoteBytes := []byte(string(r.Quote)) + commaLen := utf8.RuneLen(r.Comma) + recLine := r.numLine // Starting line for record + r.recordBuffer = r.recordBuffer[:0] + r.fieldIndexes = r.fieldIndexes[:0] + r.fieldPositions = r.fieldPositions[:0] + pos := position{line: r.numLine, col: 1} +parseField: + for { + if r.TrimLeadingSpace { + i := bytes.IndexFunc(line, func(r rune) bool { + return !unicode.IsSpace(r) + }) + if i < 0 { + i = len(line) + pos.col -= lengthNL(line) + } + line = line[i:] + pos.col += i + } + if len(line) == 0 || !bytes.HasPrefix(line, quoteBytes) { + // Non-quoted string field + i := bytes.IndexRune(line, r.Comma) + field := line + if i >= 0 { + field = field[:i] + } else { + field = field[:len(field)-lengthNL(field)] + } + // Check to make sure a quote does not appear in field. + if !r.LazyQuotes { + if j := bytes.IndexRune(field, r.Quote); j >= 0 { + col := pos.col + j + err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote} + break parseField + } + } + r.recordBuffer = append(r.recordBuffer, field...) + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + r.fieldPositions = append(r.fieldPositions, pos) + if i >= 0 { + line = line[i+commaLen:] + pos.col += i + commaLen + continue parseField + } + break parseField + } else { + // Quoted string field + fieldPos := pos + line = line[quoteLen:] + pos.col += quoteLen + for { + i := bytes.IndexRune(line, r.Quote) + if i >= 0 { + // Hit next quote. + r.recordBuffer = append(r.recordBuffer, line[:i]...) + line = line[i+quoteLen:] + pos.col += i + quoteLen + switch rn := nextRune(line); { + case rn == r.Quote: + // `""` sequence (append quote). + r.recordBuffer = append(r.recordBuffer, quoteBytes...) + line = line[quoteLen:] + pos.col += quoteLen + case rn == r.Comma: + // `",` sequence (end of field). + line = line[commaLen:] + pos.col += commaLen + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + r.fieldPositions = append(r.fieldPositions, fieldPos) + continue parseField + case lengthNL(line) == len(line): + // `"\n` sequence (end of line). + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + r.fieldPositions = append(r.fieldPositions, fieldPos) + break parseField + case r.LazyQuotes: + // `"` sequence (bare quote). + r.recordBuffer = append(r.recordBuffer, quoteBytes...) + default: + // `"*` sequence (invalid non-escaped quote). + err = &ParseError{StartLine: recLine, Line: r.numLine, Column: pos.col - quoteLen, Err: ErrQuote} + break parseField + } + } else if len(line) > 0 { + // Hit end of line (copy all data so far). + r.recordBuffer = append(r.recordBuffer, line...) + if errRead != nil { + break parseField + } + pos.col += len(line) + line, errRead = r.readLine() + if len(line) > 0 { + pos.line++ + pos.col = 1 + } + if errRead == io.EOF { + errRead = nil + } + } else { + // Abrupt end of file (EOF or error). + if !r.LazyQuotes && errRead == nil { + err = &ParseError{StartLine: recLine, Line: pos.line, Column: pos.col, Err: ErrQuote} + break parseField + } + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + r.fieldPositions = append(r.fieldPositions, fieldPos) + break parseField + } + } + } + } + if err == nil { + err = errRead + } + + // Create a single string and create slices out of it. + // This pins the memory of the fields together, but allocates once. + str := string(r.recordBuffer) // Convert to string once to batch allocations + dst = dst[:0] + if cap(dst) < len(r.fieldIndexes) { + dst = make([]string, len(r.fieldIndexes)) + } + dst = dst[:len(r.fieldIndexes)] + var preIdx int + for i, idx := range r.fieldIndexes { + dst[i] = str[preIdx:idx] + preIdx = idx + } + + // Check or update the expected fields per record. + if r.FieldsPerRecord > 0 { + if len(dst) != r.FieldsPerRecord && err == nil { + err = &ParseError{ + StartLine: recLine, + Line: recLine, + Column: 1, + Err: ErrFieldCount, + } + } + } else if r.FieldsPerRecord == 0 { + r.FieldsPerRecord = len(dst) + } + return dst, err +} diff --git a/internal/csvex/reader_test.go b/internal/csvex/reader_test.go new file mode 100644 index 000000000..60100eb1c --- /dev/null +++ b/internal/csvex/reader_test.go @@ -0,0 +1,669 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package csvex + +import ( + "errors" + "fmt" + "io" + "reflect" + "strings" + "testing" + "unicode/utf8" +) + +type readTest struct { + Name string + Input string + Output [][]string + Positions [][][2]int + Errors []error + + // These fields are copied into the Reader + Comma rune + Quote rune + Comment rune + UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1 + FieldsPerRecord int + LazyQuotes bool + TrimLeadingSpace bool + ReuseRecord bool +} + +// In these tests, the §, ¶ and ∑ characters in readTest.Input are used to denote +// the start of a field, a record boundary and the position of an error respectively. +// They are removed before parsing and are used to verify the position +// information reported by FieldPos. + +var readTests = []readTest{{ + Name: "Simple", + Input: "§a,§b,§c\n", + Output: [][]string{{"a", "b", "c"}}, +}, { + Name: "CRLF", + Input: "§a,§b\r\n¶§c,§d\r\n", + Output: [][]string{{"a", "b"}, {"c", "d"}}, +}, { + Name: "BareCR", + Input: "§a,§b\rc,§d\r\n", + Output: [][]string{{"a", "b\rc", "d"}}, +}, { + Name: "RFC4180test", + Input: `§#field1,§field2,§field3 +¶§"aaa",§"bb +b",§"ccc" +¶§"a,a",§"b""bb",§"ccc" +¶§zzz,§yyy,§xxx +`, + Output: [][]string{ + {"#field1", "field2", "field3"}, + {"aaa", "bb\nb", "ccc"}, + {"a,a", `b"bb`, "ccc"}, + {"zzz", "yyy", "xxx"}, + }, + UseFieldsPerRecord: true, + FieldsPerRecord: 0, +}, { + Name: "NoEOLTest", + Input: "§a,§b,§c", + Output: [][]string{{"a", "b", "c"}}, +}, { + Name: "Semicolon", + Input: "§a;§b;§c\n", + Output: [][]string{{"a", "b", "c"}}, + Comma: ';', +}, { + Name: "MultiLine", + Input: `§"two +line",§"one line",§"three +line +field"`, + Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}}, +}, { + Name: "BlankLine", + Input: "§a,§b,§c\n\n¶§d,§e,§f\n\n", + Output: [][]string{ + {"a", "b", "c"}, + {"d", "e", "f"}, + }, +}, { + Name: "BlankLineFieldCount", + Input: "§a,§b,§c\n\n¶§d,§e,§f\n\n", + Output: [][]string{ + {"a", "b", "c"}, + {"d", "e", "f"}, + }, + UseFieldsPerRecord: true, + FieldsPerRecord: 0, +}, { + Name: "TrimSpace", + Input: " §a, §b, §c\n", + Output: [][]string{{"a", "b", "c"}}, + TrimLeadingSpace: true, +}, { + Name: "LeadingSpace", + Input: "§ a,§ b,§ c\n", + Output: [][]string{{" a", " b", " c"}}, +}, { + Name: "Comment", + Input: "#1,2,3\n§a,§b,§c\n#comment", + Output: [][]string{{"a", "b", "c"}}, + Comment: '#', +}, { + Name: "NoComment", + Input: "§#1,§2,§3\n¶§a,§b,§c", + Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}}, +}, { + Name: "LazyQuotes", + Input: `§a "word",§"1"2",§a",§"b`, + Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}}, + LazyQuotes: true, +}, { + Name: "BareQuotes", + Input: `§a "word",§"1"2",§a"`, + Output: [][]string{{`a "word"`, `1"2`, `a"`}}, + LazyQuotes: true, +}, { + Name: "BareDoubleQuotes", + Input: `§a""b,§c`, + Output: [][]string{{`a""b`, `c`}}, + LazyQuotes: true, +}, { + Name: "BadDoubleQuotes", + Input: `§a∑""b,c`, + Errors: []error{&ParseError{Err: ErrBareQuote}}, +}, { + Name: "TrimQuote", + Input: ` §"a",§" b",§c`, + Output: [][]string{{"a", " b", "c"}}, + TrimLeadingSpace: true, +}, { + Name: "BadBareQuote", + Input: `§a ∑"word","b"`, + Errors: []error{&ParseError{Err: ErrBareQuote}}, +}, { + Name: "BadTrailingQuote", + Input: `§"a word",b∑"`, + Errors: []error{&ParseError{Err: ErrBareQuote}}, +}, { + Name: "ExtraneousQuote", + Input: `§"a ∑"word","b"`, + Errors: []error{&ParseError{Err: ErrQuote}}, +}, { + Name: "BadFieldCount", + Input: "§a,§b,§c\n¶∑§d,§e", + Errors: []error{nil, &ParseError{Err: ErrFieldCount}}, + Output: [][]string{{"a", "b", "c"}, {"d", "e"}}, + UseFieldsPerRecord: true, + FieldsPerRecord: 0, +}, { + Name: "BadFieldCountMultiple", + Input: "§a,§b,§c\n¶∑§d,§e\n¶∑§f", + Errors: []error{nil, &ParseError{Err: ErrFieldCount}, &ParseError{Err: ErrFieldCount}}, + Output: [][]string{{"a", "b", "c"}, {"d", "e"}, {"f"}}, + UseFieldsPerRecord: true, + FieldsPerRecord: 0, +}, { + Name: "BadFieldCount1", + Input: `§∑a,§b,§c`, + Errors: []error{&ParseError{Err: ErrFieldCount}}, + Output: [][]string{{"a", "b", "c"}}, + UseFieldsPerRecord: true, + FieldsPerRecord: 2, +}, { + Name: "FieldCount", + Input: "§a,§b,§c\n¶§d,§e", + Output: [][]string{{"a", "b", "c"}, {"d", "e"}}, +}, { + Name: "TrailingCommaEOF", + Input: "§a,§b,§c,§", + Output: [][]string{{"a", "b", "c", ""}}, +}, { + Name: "TrailingCommaEOL", + Input: "§a,§b,§c,§\n", + Output: [][]string{{"a", "b", "c", ""}}, +}, { + Name: "TrailingCommaSpaceEOF", + Input: "§a,§b,§c, §", + Output: [][]string{{"a", "b", "c", ""}}, + TrimLeadingSpace: true, +}, { + Name: "TrailingCommaSpaceEOL", + Input: "§a,§b,§c, §\n", + Output: [][]string{{"a", "b", "c", ""}}, + TrimLeadingSpace: true, +}, { + Name: "TrailingCommaLine3", + Input: "§a,§b,§c\n¶§d,§e,§f\n¶§g,§hi,§", + Output: [][]string{{"a", "b", "c"}, {"d", "e", "f"}, {"g", "hi", ""}}, + TrimLeadingSpace: true, +}, { + Name: "NotTrailingComma3", + Input: "§a,§b,§c,§ \n", + Output: [][]string{{"a", "b", "c", " "}}, +}, { + Name: "CommaFieldTest", + Input: `§x,§y,§z,§w +¶§x,§y,§z,§ +¶§x,§y,§,§ +¶§x,§,§,§ +¶§,§,§,§ +¶§"x",§"y",§"z",§"w" +¶§"x",§"y",§"z",§"" +¶§"x",§"y",§"",§"" +¶§"x",§"",§"",§"" +¶§"",§"",§"",§"" +`, + Output: [][]string{ + {"x", "y", "z", "w"}, + {"x", "y", "z", ""}, + {"x", "y", "", ""}, + {"x", "", "", ""}, + {"", "", "", ""}, + {"x", "y", "z", "w"}, + {"x", "y", "z", ""}, + {"x", "y", "", ""}, + {"x", "", "", ""}, + {"", "", "", ""}, + }, +}, { + Name: "TrailingCommaIneffective1", + Input: "§a,§b,§\n¶§c,§d,§e", + Output: [][]string{ + {"a", "b", ""}, + {"c", "d", "e"}, + }, + TrimLeadingSpace: true, +}, { + Name: "ReadAllReuseRecord", + Input: "§a,§b\n¶§c,§d", + Output: [][]string{ + {"a", "b"}, + {"c", "d"}, + }, + ReuseRecord: true, +}, { + Name: "StartLine1", // Issue 19019 + Input: "§a,\"b\nc∑\"d,e", + Errors: []error{&ParseError{Err: ErrQuote}}, +}, { + Name: "StartLine2", + Input: "§a,§b\n¶§\"d\n\n,e∑", + Errors: []error{nil, &ParseError{Err: ErrQuote}}, + Output: [][]string{{"a", "b"}}, +}, { + Name: "CRLFInQuotedField", // Issue 21201 + Input: "§A,§\"Hello\r\nHi\",§B\r\n", + Output: [][]string{ + {"A", "Hello\nHi", "B"}, + }, +}, { + Name: "BinaryBlobField", // Issue 19410 + Input: "§x09\x41\xb4\x1c,§aktau", + Output: [][]string{{"x09A\xb4\x1c", "aktau"}}, +}, { + Name: "TrailingCR", + Input: "§field1,§field2\r", + Output: [][]string{{"field1", "field2"}}, +}, { + Name: "QuotedTrailingCR", + Input: "§\"field\"\r", + Output: [][]string{{"field"}}, +}, { + Name: "QuotedTrailingCRCR", + Input: "§\"field∑\"\r\r", + Errors: []error{&ParseError{Err: ErrQuote}}, +}, { + Name: "FieldCR", + Input: "§field\rfield\r", + Output: [][]string{{"field\rfield"}}, +}, { + Name: "FieldCRCR", + Input: "§field\r\rfield\r\r", + Output: [][]string{{"field\r\rfield\r"}}, +}, { + Name: "FieldCRCRLF", + Input: "§field\r\r\n¶§field\r\r\n", + Output: [][]string{{"field\r"}, {"field\r"}}, +}, { + Name: "FieldCRCRLFCR", + Input: "§field\r\r\n¶§\rfield\r\r\n\r", + Output: [][]string{{"field\r"}, {"\rfield\r"}}, +}, { + Name: "FieldCRCRLFCRCR", + Input: "§field\r\r\n¶§\r\rfield\r\r\n¶§\r\r", + Output: [][]string{{"field\r"}, {"\r\rfield\r"}, {"\r"}}, +}, { + Name: "MultiFieldCRCRLFCRCR", + Input: "§field1,§field2\r\r\n¶§\r\rfield1,§field2\r\r\n¶§\r\r,§", + Output: [][]string{ + {"field1", "field2\r"}, + {"\r\rfield1", "field2\r"}, + {"\r\r", ""}, + }, +}, { + Name: "NonASCIICommaAndComment", + Input: "§a£§b,c£ \t§d,e\n€ comment\n", + Output: [][]string{{"a", "b,c", "d,e"}}, + TrimLeadingSpace: true, + Comma: '£', + Comment: '€', +}, { + Name: "NonASCIICommaAndCommentWithQuotes", + Input: "§a€§\" b,\"€§ c\nλ comment\n", + Output: [][]string{{"a", " b,", " c"}}, + Comma: '€', + Comment: 'λ', +}, { + // λ and θ start with the same byte. + // This tests that the parser doesn't confuse such characters. + Name: "NonASCIICommaConfusion", + Input: "§\"abθcd\"λ§efθgh", + Output: [][]string{{"abθcd", "efθgh"}}, + Comma: 'λ', + Comment: '€', +}, { + Name: "NonASCIICommentConfusion", + Input: "§λ\n¶§λ\nθ\n¶§λ\n", + Output: [][]string{{"λ"}, {"λ"}, {"λ"}}, + Comment: 'θ', +}, { + Name: "QuotedFieldMultipleLF", + Input: "§\"\n\n\n\n\"", + Output: [][]string{{"\n\n\n\n"}}, +}, { + Name: "MultipleCRLF", + Input: "\r\n\r\n\r\n\r\n", +}, { + // The implementation may read each line in several chunks if it doesn't fit entirely + // in the read buffer, so we should test the code to handle that condition. + Name: "HugeLines", + Input: strings.Repeat("#ignore\n", 10000) + "§" + strings.Repeat("@", 5000) + ",§" + strings.Repeat("*", 5000), + Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}}, + Comment: '#', +}, { + Name: "QuoteWithTrailingCRLF", + Input: "§\"foo∑\"bar\"\r\n", + Errors: []error{&ParseError{Err: ErrQuote}}, +}, { + Name: "LazyQuoteWithTrailingCRLF", + Input: "§\"foo\"bar\"\r\n", + Output: [][]string{{`foo"bar`}}, + LazyQuotes: true, +}, { + Name: "DoubleQuoteWithTrailingCRLF", + Input: "§\"foo\"\"bar\"\r\n", + Output: [][]string{{`foo"bar`}}, +}, { + Name: "EvenQuotes", + Input: `§""""""""`, + Output: [][]string{{`"""`}}, +}, { + Name: "OddQuotes", + Input: `§"""""""∑`, + Errors: []error{&ParseError{Err: ErrQuote}}, +}, { + Name: "LazyOddQuotes", + Input: `§"""""""`, + Output: [][]string{{`"""`}}, + LazyQuotes: true, +}, { + Name: "BadComma1", + Comma: '\n', + Errors: []error{errInvalidDelim}, +}, { + Name: "BadComma2", + Comma: '\r', + Errors: []error{errInvalidDelim}, +}, { + Name: "BadComma3", + Comma: '"', + Errors: []error{errInvalidDelim}, +}, { + Name: "BadComma4", + Comma: utf8.RuneError, + Errors: []error{errInvalidDelim}, +}, { + Name: "BadComment1", + Comment: '\n', + Errors: []error{errInvalidDelim}, +}, { + Name: "BadComment2", + Comment: '\r', + Errors: []error{errInvalidDelim}, +}, { + Name: "BadComment3", + Comment: utf8.RuneError, + Errors: []error{errInvalidDelim}, +}, { + Name: "BadCommaComment", + Comma: 'X', + Comment: 'X', + Errors: []error{errInvalidDelim}, +}, { + Name: "QuoteCharacter", + Input: "§`a`,§`b`\n¶§`c`,§`d`,§`e`", + Output: [][]string{ + {"a", "b"}, + {"c", "d", "e"}, + }, + Quote: '`', +}} + +func TestRead(t *testing.T) { + newReader := func(tt readTest) (*Reader, [][][2]int, map[int][2]int, string) { + positions, errPositions, input := makePositions(tt.Input) + r := NewReader(strings.NewReader(input)) + + if tt.Comma != 0 { + r.Comma = tt.Comma + } + if tt.Quote != 0 { + r.Quote = tt.Quote + } + r.Comment = tt.Comment + if tt.UseFieldsPerRecord { + r.FieldsPerRecord = tt.FieldsPerRecord + } else { + r.FieldsPerRecord = -1 + } + r.LazyQuotes = tt.LazyQuotes + r.TrimLeadingSpace = tt.TrimLeadingSpace + r.ReuseRecord = tt.ReuseRecord + return r, positions, errPositions, input + } + + for _, tt := range readTests { + t.Run(tt.Name, func(t *testing.T) { + r, positions, errPositions, input := newReader(tt) + out, err := r.ReadAll() + if wantErr := firstError(tt.Errors, positions, errPositions); wantErr != nil { + if !reflect.DeepEqual(err, wantErr) { + t.Fatalf("ReadAll() error mismatch:\ngot %v (%#v)\nwant %v (%#v)", err, err, wantErr, wantErr) + } + if out != nil { + t.Fatalf("ReadAll() output:\ngot %q\nwant nil", out) + } + } else { + if err != nil { + t.Fatalf("unexpected Readall() error: %v", err) + } + if !reflect.DeepEqual(out, tt.Output) { + t.Fatalf("ReadAll() output:\ngot %q\nwant %q", out, tt.Output) + } + } + + // Check input offset after call ReadAll() + inputByteSize := len(input) + inputOffset := r.InputOffset() + if err == nil && int64(inputByteSize) != inputOffset { + t.Errorf("wrong input offset after call ReadAll():\ngot: %d\nwant: %d\ninput: %s", inputOffset, inputByteSize, input) + } + + // Check field and error positions. + r, _, _, _ = newReader(tt) + for recNum := 0; ; recNum++ { + rec, err := r.Read() + var wantErr error + if recNum < len(tt.Errors) && tt.Errors[recNum] != nil { + wantErr = errorWithPosition(tt.Errors[recNum], recNum, positions, errPositions) + } else if recNum >= len(tt.Output) { + wantErr = io.EOF + } + if !reflect.DeepEqual(err, wantErr) { + t.Fatalf("Read() error at record %d:\ngot %v (%#v)\nwant %v (%#v)", recNum, err, err, wantErr, wantErr) + } + // ErrFieldCount is explicitly non-fatal. + if err != nil && !errors.Is(err, ErrFieldCount) { + if recNum < len(tt.Output) { + t.Fatalf("need more records; got %d want %d", recNum, len(tt.Output)) + } + break + } + if got, want := rec, tt.Output[recNum]; !reflect.DeepEqual(got, want) { + t.Errorf("Read vs ReadAll mismatch;\ngot %q\nwant %q", got, want) + } + pos := positions[recNum] + if len(pos) != len(rec) { + t.Fatalf("mismatched position length at record %d", recNum) + } + for i := range rec { + line, col := r.FieldPos(i) + if got, want := [2]int{line, col}, pos[i]; got != want { + t.Errorf("position mismatch at record %d, field %d;\ngot %v\nwant %v", recNum, i, got, want) + } + } + } + }) + } +} + +// firstError returns the first non-nil error in errs, +// with the position adjusted according to the error's +// index inside positions. +func firstError(errs []error, positions [][][2]int, errPositions map[int][2]int) error { + for i, err := range errs { + if err != nil { + return errorWithPosition(err, i, positions, errPositions) + } + } + return nil +} + +func errorWithPosition(err error, recNum int, positions [][][2]int, errPositions map[int][2]int) error { + parseErr, ok := err.(*ParseError) + if !ok { + return err + } + if recNum >= len(positions) { + panic(fmt.Errorf("no positions found for error at record %d", recNum)) + } + errPos, ok := errPositions[recNum] + if !ok { + panic(fmt.Errorf("no error position found for error at record %d", recNum)) + } + parseErr1 := *parseErr + parseErr1.StartLine = positions[recNum][0][0] + parseErr1.Line = errPos[0] + parseErr1.Column = errPos[1] + return &parseErr1 +} + +// makePositions returns the expected field positions of all +// the fields in text, the positions of any errors, and the text with the position markers +// removed. +// +// The start of each field is marked with a § symbol; +// CSV lines are separated by ¶ symbols; +// Error positions are marked with ∑ symbols. +func makePositions(text string) ([][][2]int, map[int][2]int, string) { + buf := make([]byte, 0, len(text)) + var positions [][][2]int + errPositions := make(map[int][2]int) + line, col := 1, 1 + recNum := 0 + + for len(text) > 0 { + r, size := utf8.DecodeRuneInString(text) + switch r { + case '\n': + line++ + col = 1 + buf = append(buf, '\n') + case '§': + if len(positions) == 0 { + positions = append(positions, [][2]int{}) + } + positions[len(positions)-1] = append(positions[len(positions)-1], [2]int{line, col}) + case '¶': + positions = append(positions, [][2]int{}) + recNum++ + case '∑': + errPositions[recNum] = [2]int{line, col} + default: + buf = append(buf, text[:size]...) + col += size + } + text = text[size:] + } + return positions, errPositions, string(buf) +} + +// nTimes is an io.Reader which yields the string s n times. +type nTimes struct { + s string + n int + off int +} + +func (r *nTimes) Read(p []byte) (n int, err error) { + for { + if r.n <= 0 || r.s == "" { + return n, io.EOF + } + n0 := copy(p, r.s[r.off:]) + p = p[n0:] + n += n0 + r.off += n0 + if r.off == len(r.s) { + r.off = 0 + r.n-- + } + if len(p) == 0 { + return + } + } +} + +// benchmarkRead measures reading the provided CSV rows data. +// initReader, if non-nil, modifies the Reader before it's used. +func benchmarkRead(b *testing.B, initReader func(*Reader), rows string) { + b.ReportAllocs() + r := NewReader(&nTimes{s: rows, n: b.N}) + if initReader != nil { + initReader(r) + } + for { + _, err := r.Read() + if err == io.EOF { + break + } + if err != nil { + b.Fatal(err) + } + } +} + +const benchmarkCSVData = `x,y,z,w +x,y,z, +x,y,, +x,,, +,,, +"x","y","z","w" +"x","y","z","" +"x","y","","" +"x","","","" +"","","","" +` + +func BenchmarkRead(b *testing.B) { + benchmarkRead(b, nil, benchmarkCSVData) +} + +func BenchmarkReadWithFieldsPerRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = 4 }, benchmarkCSVData) +} + +func BenchmarkReadWithoutFieldsPerRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = -1 }, benchmarkCSVData) +} + +func BenchmarkReadLargeFields(b *testing.B) { + benchmarkRead(b, nil, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv +,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +`, 3)) +} + +func BenchmarkReadReuseRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData) +} + +func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData) +} + +func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData) +} + +func BenchmarkReadReuseRecordLargeFields(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv +,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +`, 3)) +} diff --git a/internal/csvex/write_test.go b/internal/csvex/write_test.go new file mode 100644 index 000000000..39a46a858 --- /dev/null +++ b/internal/csvex/write_test.go @@ -0,0 +1,124 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// skip to keep code similar upstream +// +//nolint:errorlint,errcheck +package csvex + +import ( + "bytes" + "errors" + "strings" + "testing" +) + +var writeTests = []struct { + Input [][]string + Output string + Error error + UseCRLF bool + Comma rune + Quote rune +}{ + {Input: [][]string{{"abc"}}, Output: "abc\n"}, + {Input: [][]string{{"abc"}}, Output: "abc\r\n", UseCRLF: true}, + {Input: [][]string{{`"abc"`}}, Output: `"""abc"""` + "\n"}, + {Input: [][]string{{`a"b`}}, Output: `"a""b"` + "\n"}, + {Input: [][]string{{`"a"b"`}}, Output: `"""a""b"""` + "\n"}, + {Input: [][]string{{" abc"}}, Output: `" abc"` + "\n"}, + {Input: [][]string{{"abc,def"}}, Output: `"abc,def"` + "\n"}, + {Input: [][]string{{"abc", "def"}}, Output: "abc,def\n"}, + {Input: [][]string{{"abc"}, {"def"}}, Output: "abc\ndef\n"}, + {Input: [][]string{{"abc\ndef"}}, Output: "\"abc\ndef\"\n"}, + {Input: [][]string{{"abc\ndef"}}, Output: "\"abc\r\ndef\"\r\n", UseCRLF: true}, + {Input: [][]string{{"abc\rdef"}}, Output: "\"abcdef\"\r\n", UseCRLF: true}, + {Input: [][]string{{"abc\rdef"}}, Output: "\"abc\rdef\"\n", UseCRLF: false}, + {Input: [][]string{{""}}, Output: "\n"}, + {Input: [][]string{{"", ""}}, Output: ",\n"}, + {Input: [][]string{{"", "", ""}}, Output: ",,\n"}, + {Input: [][]string{{"", "", "a"}}, Output: ",,a\n"}, + {Input: [][]string{{"", "a", ""}}, Output: ",a,\n"}, + {Input: [][]string{{"", "a", "a"}}, Output: ",a,a\n"}, + {Input: [][]string{{"a", "", ""}}, Output: "a,,\n"}, + {Input: [][]string{{"a", "", "a"}}, Output: "a,,a\n"}, + {Input: [][]string{{"a", "a", ""}}, Output: "a,a,\n"}, + {Input: [][]string{{"a", "a", "a"}}, Output: "a,a,a\n"}, + {Input: [][]string{{`\.`}}, Output: "\"\\.\"\n"}, + {Input: [][]string{{"x09\x41\xb4\x1c", "aktau"}}, Output: "x09\x41\xb4\x1c,aktau\n"}, + {Input: [][]string{{",x09\x41\xb4\x1c", "aktau"}}, Output: "\",x09\x41\xb4\x1c\",aktau\n"}, + {Input: [][]string{{"a", "a", ""}}, Output: "a|a|\n", Comma: '|'}, + {Input: [][]string{{",", ",", ""}}, Output: ",|,|\n", Comma: '|'}, + {Input: [][]string{{"foo"}}, Comma: '"', Error: errInvalidDelim}, + + {Input: [][]string{{"`abc`"}}, Output: "```abc```" + "\n", Quote: '`'}, + {Input: [][]string{{"a`b"}}, Output: "`a``b`" + "\n", Quote: '`'}, + {Input: [][]string{{"`a`b`"}}, Output: "```a``b```" + "\n", Quote: '`'}, +} + +func TestWrite(t *testing.T) { + for n, tt := range writeTests { + b := &strings.Builder{} + f := NewWriter(b) + f.UseCRLF = tt.UseCRLF + if tt.Comma != 0 { + f.Comma = tt.Comma + } + if tt.Quote != 0 { + f.Quote = tt.Quote + } + err := f.WriteAll(tt.Input) + if err != tt.Error { + t.Errorf("Unexpected error:\ngot %v\nwant %v", err, tt.Error) + } + out := b.String() + if out != tt.Output { + t.Errorf("#%d: out=%q want %q", n, out, tt.Output) + } + } +} + +type errorWriter struct{} + +func (e errorWriter) Write(b []byte) (int, error) { + return 0, errors.New("Test") +} + +func TestError(t *testing.T) { + b := &bytes.Buffer{} + f := NewWriter(b) + f.Write([]string{"abc"}) + f.Flush() + err := f.Error() + + if err != nil { + t.Errorf("Unexpected error: %s\n", err) + } + + f = NewWriter(errorWriter{}) + f.Write([]string{"abc"}) + f.Flush() + err = f.Error() + + if err == nil { + t.Error("Error should not be nil") + } +} + +var benchmarkWriteData = [][]string{ + {"abc", "def", "12356", "1234567890987654311234432141542132"}, + {"abc", "def", "12356", "1234567890987654311234432141542132"}, + {"abc", "def", "12356", "1234567890987654311234432141542132"}, +} + +func BenchmarkWrite(b *testing.B) { + for i := 0; i < b.N; i++ { + w := NewWriter(&bytes.Buffer{}) + err := w.WriteAll(benchmarkWriteData) + if err != nil { + b.Fatal(err) + } + w.Flush() + } +} diff --git a/internal/csvex/writer.go b/internal/csvex/writer.go new file mode 100644 index 000000000..2460b535c --- /dev/null +++ b/internal/csvex/writer.go @@ -0,0 +1,186 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package csvex + +import ( + "bufio" + "io" + "strings" + "unicode" + "unicode/utf8" +) + +// A Writer writes records using CSV encoding. +// +// As returned by NewWriter, a Writer writes records terminated by a +// newline and uses ',' as the field delimiter. The exported fields can be +// changed to customize the details before the first call to Write or WriteAll. +// +// Comma is the field delimiter. +// +// If UseCRLF is true, the Writer ends each output line with \r\n instead of \n. +// +// The writes of individual records are buffered. +// After all data has been written, the client should call the +// Flush method to guarantee all data has been forwarded to +// the underlying io.Writer. Any errors that occurred should +// be checked by calling the Error method. +type Writer struct { + Comma rune // Field delimiter (set to ',' by NewWriter) + UseCRLF bool // True to use \r\n as the line terminator + Quote rune // Quote character + w *bufio.Writer +} + +// NewWriter returns a new Writer that writes to w. +func NewWriter(w io.Writer) *Writer { + return &Writer{ + Comma: ',', + Quote: '"', + w: bufio.NewWriter(w), + } +} + +// Write writes a single CSV record to w along with any necessary quoting. +// A record is a slice of strings with each string being one field. +// Writes are buffered, so Flush must eventually be called to ensure +// that the record is written to the underlying io.Writer. +func (w *Writer) Write(record []string) error { + if !validDelim(w.Comma, w.Quote) { + return errInvalidDelim + } + + quoteStr := string(w.Quote) + specialChars := quoteStr + "\r\n" + + for n, field := range record { + if n > 0 { + if _, err := w.w.WriteRune(w.Comma); err != nil { + return err + } + } + + // If we don't have to have a quoted field then just + // write out the field and continue to the next field. + if !w.fieldNeedsQuotes(field) { + if _, err := w.w.WriteString(field); err != nil { + return err + } + continue + } + + if _, err := w.w.WriteRune(w.Quote); err != nil { + return err + } + for len(field) > 0 { + // Search for special characters. + i := strings.IndexAny(field, specialChars) + if i < 0 { + i = len(field) + } + + // Copy verbatim everything before the special character. + if _, err := w.w.WriteString(field[:i]); err != nil { + return err + } + field = field[i:] + + // Encode the special character. + if len(field) > 0 { + var err error + switch { + case strings.HasPrefix(field, quoteStr): + _, err = w.w.WriteString(quoteStr + quoteStr) + case field[0] == '\r': + if !w.UseCRLF { + err = w.w.WriteByte('\r') + } + case field[0] == '\n': + if w.UseCRLF { + _, err = w.w.WriteString("\r\n") + } else { + err = w.w.WriteByte('\n') + } + } + field = field[1:] + if err != nil { + return err + } + } + } + if _, err := w.w.WriteRune(w.Quote); err != nil { + return err + } + } + var err error + if w.UseCRLF { + _, err = w.w.WriteString("\r\n") + } else { + err = w.w.WriteByte('\n') + } + return err +} + +// Flush writes any buffered data to the underlying io.Writer. +// To check if an error occurred during the Flush, call Error. +func (w *Writer) Flush() { + w.w.Flush() +} + +// Error reports any error that has occurred during a previous Write or Flush. +func (w *Writer) Error() error { + _, err := w.w.Write(nil) + return err +} + +// WriteAll writes multiple CSV records to w using Write and then calls Flush, +// returning any error from the Flush. +func (w *Writer) WriteAll(records [][]string) error { + for _, record := range records { + err := w.Write(record) + if err != nil { + return err + } + } + return w.w.Flush() +} + +// fieldNeedsQuotes reports whether our field must be enclosed in quotes. +// Fields with a Comma, fields with a quote or newline, and +// fields which start with a space must be enclosed in quotes. +// We used to quote empty strings, but we do not anymore (as of Go 1.4). +// The two representations should be equivalent, but Postgres distinguishes +// quoted vs non-quoted empty string during database imports, and it has +// an option to force the quoted behavior for non-quoted CSV but it has +// no option to force the non-quoted behavior for quoted CSV, making +// CSV with quoted empty strings strictly less useful. +// Not quoting the empty string also makes this package match the behavior +// of Microsoft Excel and Google Drive. +// For Postgres, quote the data terminating string `\.`. +func (w *Writer) fieldNeedsQuotes(field string) bool { + if field == "" { + return false + } + + if field == `\.` { + return true + } + + if w.Comma < utf8.RuneSelf { + for i := 0; i < len(field); i++ { + c := field[i] + if c == '\n' || c == '\r' || c == byte(w.Quote) || c == byte(w.Comma) { + return true + } + } + } else { + if strings.ContainsRune(field, w.Comma) || strings.ContainsAny(field, "\"\r\n") { + return true + } + } + + r1, _ := utf8.DecodeRuneInString(field) + return unicode.IsSpace(r1) +}