Skip to content

Commit

Permalink
locations for tokens and error messages
Browse files Browse the repository at this point in the history
  • Loading branch information
bcpeinhardt committed Dec 29, 2023
1 parent 3ca2c68 commit 8b19313
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 29 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@

## Unreleased

## v1.1.0 - 29 December 2023
- Add `to_lists_or_panic`, a function which panics with a descriptive error
message when it fails to parse csv. The message includes the location (line
and column) reported by the parser.

## v1.0.0 - 27 December 2023
- Init changelog w/v1 so people's stuff doesn't break.
2 changes: 1 addition & 1 deletion gleam.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = "gsv"
version = "1.0.0"
version = "1.1.0"
gleam = ">= 0.32.0"
description = "A simple csv parser and generator written in gleam "

Expand Down
31 changes: 29 additions & 2 deletions src/gsv.gleam
Original file line number Diff line number Diff line change
@@ -1,14 +1,41 @@
import gsv/internal/ast
import gsv/internal/token
import gsv/internal/ast.{ParseError}
import gsv/internal/token.{Location}
import gleam/list
import gleam/string
import gleam/result
import gleam/int

/// Turns csv text into rows, each row being a list of field strings.
/// Both Unix and Windows line endings are handled automatically.
/// Any parse failure is reported as `Error(Nil)`; the detailed parse error
/// is discarded.
pub fn to_lists(input: String) -> Result(List(List(String)), Nil) {
  let located_tokens = token.with_location(token.scan(input))

  case ast.parse(located_tokens) {
    Ok(rows) -> Ok(rows)
    // Drop the error detail so the public return type stays `Result(_, Nil)`.
    Error(_) -> Error(Nil)
  }
}

/// Parses a csv string to a list of lists of strings.
/// Automatically handles Windows and Unix line endings.
/// Panics with an error msg if the string is not valid csv. The message has
/// the form `[line L column C] of csv: <reason>`, where the line and column
/// are taken from the location attached to the parse error.
pub fn to_lists_or_panic(input: String) -> List(List(String)) {
  let res =
    input
    |> token.scan
    |> token.with_location
    |> ast.parse

  case res {
    Ok(lol) -> lol
    // `panic` never returns, so this branch needs no placeholder value
    // after it to satisfy the return type.
    Error(ParseError(Location(line, column), msg)) ->
      panic as {
        "[line "
        <> int.to_string(line)
        <> " column "
        <> int.to_string(column)
        <> "] of csv: "
        <> msg
      }
  }
}

/// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows"
Expand Down
81 changes: 57 additions & 24 deletions src/gsv/internal/ast.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

import gleam/list
import gleam/result
import gsv/internal/token.{type CsvToken, CR, Comma, Doublequote, LF, Textdata}
import gsv/internal/token.{
type CsvToken, type Location, CR, Comma, Doublequote, LF, Location, Textdata,
}

type ParseState {
Beginning
Expand All @@ -21,7 +23,13 @@ type ParseState {
InsideEscapedString
}

pub fn parse(input: List(CsvToken)) -> Result(List(List(String)), Nil) {
/// The error returned by `parse` when the token stream is not valid csv.
/// `location` is the position the parser attaches to the failure and
/// `message` is a human-readable description of what went wrong.
pub type ParseError {
ParseError(location: Location, message: String)
}

pub fn parse(
input: List(#(CsvToken, Location)),
) -> Result(List(List(String)), ParseError) {
let inner_rev = {
use llf <- result.try(parse_p(input, Beginning, []))
use lf <- list.try_map(llf)
Expand All @@ -32,46 +40,58 @@ pub fn parse(input: List(CsvToken)) -> Result(List(List(String)), Nil) {
}

fn parse_p(
input: List(CsvToken),
input: List(#(CsvToken, Location)),
parse_state: ParseState,
llf: List(List(String)),
) -> Result(List(List(String)), Nil) {
) -> Result(List(List(String)), ParseError) {
case input, parse_state, llf {
// Error Case: An empty list should produce an Error
[], Beginning, _ -> Error(Nil)
[], Beginning, _ -> Error(ParseError(Location(0, 0), "Empty input"))

// BASE CASE: We are done parsing tokens
[], _, llf -> Ok(llf)

// File should begin with either Escaped or Nonescaped string
[Textdata(str), ..remaining_tokens], Beginning, [] ->
[#(Textdata(str), _), ..remaining_tokens], Beginning, [] ->
parse_p(remaining_tokens, JustParsedField, [[str]])

[Doublequote, ..remaining_tokens], Beginning, [] ->
[#(Doublequote, _), ..remaining_tokens], Beginning, [] ->
parse_p(remaining_tokens, InsideEscapedString, [[""]])

_, Beginning, _ -> Error(Nil)
[#(tok, loc), ..], Beginning, _ ->
Error(ParseError(
loc,
"Unexpected start to csv content: " <> token.to_lexeme(tok),
))

// If we just parsed a field, we're expecting either a comma or a CRLF
[Comma, ..remaining_tokens], JustParsedField, llf ->
[#(Comma, _), ..remaining_tokens], JustParsedField, llf ->
parse_p(remaining_tokens, JustParsedComma, llf)

[LF, ..remaining_tokens], JustParsedField, llf ->
[#(LF, _), ..remaining_tokens], JustParsedField, llf ->
parse_p(remaining_tokens, JustParsedNewline, llf)

[CR, ..remaining_tokens], JustParsedField, llf ->
[#(CR, _), ..remaining_tokens], JustParsedField, llf ->
parse_p(remaining_tokens, JustParsedCR, llf)

_, JustParsedField, _ -> Error(Nil)
[#(tok, loc), ..], JustParsedField, _ ->
Error(ParseError(
loc,
"Expected comma or newline after field, found: " <> token.to_lexeme(tok),
))

// If we just parsed a CR, we're expecting an LF
[LF, ..remaining_tokens], JustParsedCR, llf ->
[#(LF, _), ..remaining_tokens], JustParsedCR, llf ->
parse_p(remaining_tokens, JustParsedNewline, llf)

_, JustParsedCR, _ -> Error(Nil)
[#(tok, loc), ..], JustParsedCR, _ ->
Error(ParseError(
loc,
"Expected \"\\n\" after \"\\r\", found: " <> token.to_lexeme(tok),
))

// If we just parsed a comma, we're expecting an Escaped or Non-Escaped string
[Textdata(str), ..remaining_tokens], JustParsedComma, [
[#(Textdata(str), _), ..remaining_tokens], JustParsedComma, [
curr_line,
..previously_parsed_lines
] ->
Expand All @@ -80,7 +100,7 @@ fn parse_p(
..previously_parsed_lines
])

[Doublequote, ..remaining_tokens], JustParsedComma, [
[#(Doublequote, _), ..remaining_tokens], JustParsedComma, [
curr_line,
..previously_parsed_lines
] ->
Expand All @@ -89,13 +109,19 @@ fn parse_p(
..previously_parsed_lines
])

_, JustParsedComma, _ -> Error(Nil)
[#(tok, loc), ..], JustParsedComma, _ ->
Error(ParseError(
loc,
"Expected escaped or non-escaped string after comma, found: " <> token.to_lexeme(
tok,
),
))

// If we just parsed a new line, we're expecting an escaped or non-escaped string
[Textdata(str), ..remaining_tokens], JustParsedNewline, llf ->
[#(Textdata(str), _), ..remaining_tokens], JustParsedNewline, llf ->
parse_p(remaining_tokens, JustParsedField, [[str], ..llf])

[Doublequote, ..remaining_tokens], JustParsedNewline, [
[#(Doublequote, _), ..remaining_tokens], JustParsedNewline, [
curr_line,
..previously_parsed_lines
] ->
Expand All @@ -104,11 +130,17 @@ fn parse_p(
..previously_parsed_lines
])

_, JustParsedNewline, _ -> Error(Nil)
[#(tok, loc), ..], JustParsedNewline, _ ->
Error(ParseError(
loc,
"Expected escaped or non-escaped string after newline, found: " <> token.to_lexeme(
tok,
),
))

// If we're inside an escaped string, we can take anything until we get a double quote,
// but a double double quote "" escapes the double quote and we keep parsing
[Doublequote, Doublequote, ..remaining_tokens], InsideEscapedString, [
[#(Doublequote, _), #(Doublequote, _), ..remaining_tokens], InsideEscapedString, [
[str, ..rest_curr_line],
..previously_parsed_lines
] ->
Expand All @@ -117,10 +149,10 @@ fn parse_p(
..previously_parsed_lines
])

[Doublequote, ..remaining_tokens], InsideEscapedString, llf ->
[#(Doublequote, _), ..remaining_tokens], InsideEscapedString, llf ->
parse_p(remaining_tokens, JustParsedField, llf)

[other_token, ..remaining_tokens], InsideEscapedString, [
[#(other_token, _), ..remaining_tokens], InsideEscapedString, [
[str, ..rest_curr_line],
..previously_parsed_lines
] ->
Expand All @@ -130,6 +162,7 @@ fn parse_p(
])

// Anything else is an error
_, _, _ -> Error(Nil)
[#(tok, loc), ..], _, _ ->
Error(ParseError(loc, "Unexpected token: " <> token.to_lexeme(tok)))
}
}
52 changes: 52 additions & 0 deletions src/gsv/internal/token.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ pub type CsvToken {
Textdata(inner: String)
}

/// A position in the csv input, expressed as a line number and a column
/// number. `with_location` assigns the first token `Location(1, 1)`.
pub type Location {
Location(line: Int, column: Int)
}

pub fn to_lexeme(token: CsvToken) -> String {
case token {
Comma -> ","
Expand All @@ -29,6 +33,16 @@ pub fn to_lexeme(token: CsvToken) -> String {
}
}

/// Number of columns a token occupies: the punctuation and line-ending
/// tokens count as one, text data counts as `string.length` of its content.
fn len(token: CsvToken) -> Int {
  case token {
    Textdata(text) -> string.length(text)
    Comma | LF | CR | Doublequote -> 1
  }
}

pub fn scan(input: String) -> List(CsvToken) {
input
|> string.to_utf_codepoints
Expand All @@ -49,3 +63,41 @@ pub fn scan(input: String) -> List(CsvToken) {
})
|> list.reverse
}

/// Pairs every token with the location at which it starts, keeping the
/// tokens in their original order. Counting begins at line 1, column 1.
pub fn with_location(input: List(CsvToken)) -> List(#(CsvToken, Location)) {
  let start = Location(line: 1, column: 1)
  // The helper accumulates in reverse, so flip the result back.
  let reversed = do_with_location(input, [], start)
  list.reverse(reversed)
}

/// Tail-recursive worker for `with_location`.
///
/// Walks `input`, prepending each token together with its starting location
/// onto `acc` (so `acc` ends up in reverse order), while `curr_loc` tracks
/// the location of the next token to be consumed.
fn do_with_location(
  input: List(CsvToken),
  acc: List(#(CsvToken, Location)),
  curr_loc: Location,
) -> List(#(CsvToken, Location)) {
  case input {
    // Nothing left to tag.
    [] -> acc

    // A bare LF ends the line: the next token starts at column 1 of the
    // following line.
    [LF, ..rest] -> {
      let next_line_start = Location(curr_loc.line + 1, 1)
      do_with_location(rest, [#(LF, curr_loc), ..acc], next_line_start)
    }

    // A CR immediately followed by LF is consumed as a pair: the LF sits one
    // column after the CR, and the next token starts on the following line.
    [CR, LF, ..rest] -> {
      let lf_loc = Location(curr_loc.line, curr_loc.column + 1)
      let next_line_start = Location(curr_loc.line + 1, 1)
      let tagged = [#(LF, lf_loc), #(CR, curr_loc), ..acc]
      do_with_location(rest, tagged, next_line_start)
    }

    // Every other token (including a CR not followed by LF) stays on the
    // same line and advances the column by the token's length.
    [tok, ..rest] -> {
      let after_tok = Location(curr_loc.line, curr_loc.column + len(tok))
      do_with_location(rest, [#(tok, curr_loc), ..acc], after_tok)
    }
  }
}
37 changes: 35 additions & 2 deletions test/gsv_test.gleam
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import gleeunit
import gleeunit/should
import gsv/internal/token.{CR, Comma, Doublequote, LF, Textdata, scan}
import gsv/internal/ast.{parse}
import gsv/internal/token.{
CR, Comma, Doublequote, LF, Location, Textdata, scan, with_location,
}
import gsv/internal/ast.{ParseError, parse}
import gsv.{Unix, Windows}
import gleam/list
import gleam/result
Expand Down Expand Up @@ -31,6 +33,7 @@ pub fn scan_test() {
pub fn parse_test() {
"Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE"
|> scan
|> with_location
|> parse
|> should.equal(
Ok([["Ben", " 25", " TRUE\n\r\""], ["Austin", " 25", " FALSE"]]),
Expand All @@ -40,7 +43,9 @@ pub fn parse_test() {
// Parsing an empty string must fail; the error detail is erased before the
// comparison, so only the fact of failure is checked.
pub fn parse_empty_string_fail_test() {
  let outcome = parse(with_location(scan("")))

  outcome
  |> result.nil_error
  |> should.equal(Error(Nil))
}

Expand Down Expand Up @@ -140,3 +145,31 @@ pub fn for_the_readme_test() {
|> gsv.from_lists(separator: ",", line_ending: Windows)
|> should.equal("Hello, World\r\nGoodbye, Mars")
}

// Checks that invalid csv yields a parse error carrying the expected
// location and message.
pub fn error_cases_test() {
  // Runs the scan -> locate -> parse pipeline and returns the error's
  // location and message, panicking if parsing unexpectedly succeeds.
  let failure_of = fn(csv_str) {
    let parsed = parse(with_location(scan(csv_str)))
    case parsed {
      Error(ParseError(loc, msg)) -> #(loc, msg)
      Ok(_) -> panic as "Expected an error"
    }
  }

  // Both inputs fail on a second consecutive comma.
  let double_comma_msg =
    "Expected escaped or non-escaped string after comma, found: ,"

  should.equal(failure_of("Ben, 25,, TRUE"), #(
    Location(1, 9),
    double_comma_msg,
  ))
  should.equal(
    failure_of("Austin, 25, FALSE\n\"Ben Peinhardt\", 25,, TRUE"),
    #(Location(2, 21), double_comma_msg),
  )
}
// pub fn totally_panics_test() {
// "Ben, 25,, TRUE" |> gsv.to_lists_or_panic
// }

0 comments on commit 8b19313

Please sign in to comment.