locations for tokens and error messages

bcpeinhardt · Dec 29, 2023 · 8b19313 · 8b19313
1 parent 3ca2c68
commit 8b19313
Show file tree

Hide file tree

Showing 6 changed files with 178 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,5 +2,9 @@
 
 ## Unreleased
 
+## v1.1.0 - 29 December 2023
+- Add `to_lists_or_panic`, a function which panics with a descriptive error
+  message when the csv fails to parse. The message includes the line and
+  column of the offending token.
+
 ## v1.0.0 - 27 December 2023
 - Init changelog w/v1 so people's stuff doesn't break.
diff --git a/gleam.toml b/gleam.toml
@@ -1,5 +1,5 @@
 name = "gsv"
-version = "1.0.0"
+version = "1.1.0"
 gleam = ">= 0.32.0"
 description = "A simple csv parser and generator written in gleam "
 

diff --git a/src/gsv.gleam b/src/gsv.gleam
@@ -1,14 +1,41 @@
-import gsv/internal/ast
-import gsv/internal/token
+import gsv/internal/ast.{ParseError}
+import gsv/internal/token.{Location}
 import gleam/list
 import gleam/string
+import gleam/result
+import gleam/int
 
 /// Parses a csv string to a list of lists of strings.
 /// Automatically handles Windows and Unix line endings.
+/// Returns `Error(Nil)` when parsing fails; the parser's error details
+/// (location and message) are discarded here.
 pub fn to_lists(input: String) -> Result(List(List(String)), Nil) {
   input
   |> token.scan
+  |> token.with_location
   |> ast.parse
+  |> result.nil_error
+}
+
+/// Parses a csv string to a list of lists of strings.
+/// Automatically handles Windows and Unix line endings.
+/// Panics if the string is not valid csv. The panic message has the form
+/// `[line L column C] of csv: <reason>`, where L and C come from the
+/// location attached to the parse error.
+pub fn to_lists_or_panic(input: String) -> List(List(String)) {
+  let res =
+    input
+    |> token.scan
+    |> token.with_location
+    |> ast.parse
+
+  case res {
+    Ok(lol) -> lol
+    // `panic` never returns, so no placeholder value is needed after it.
+    Error(ParseError(Location(line, column), msg)) ->
+      panic as {
+        "["
+        <> "line "
+        <> int.to_string(line)
+        <> " column "
+        <> int.to_string(column)
+        <> "] of csv: "
+        <> msg
+      }
+  }
 }
 
 /// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows"

diff --git a/src/gsv/internal/ast.gleam b/src/gsv/internal/ast.gleam
@@ -10,7 +10,9 @@
 
 import gleam/list
 import gleam/result
-import gsv/internal/token.{type CsvToken, CR, Comma, Doublequote, LF, Textdata}
+import gsv/internal/token.{
+  type CsvToken, type Location, CR, Comma, Doublequote, LF, Location, Textdata,
+}
 
 type ParseState {
   Beginning
@@ -21,7 +23,13 @@ type ParseState {
   InsideEscapedString
 }
 
-pub fn parse(input: List(CsvToken)) -> Result(List(List(String)), Nil) {
+/// A parse failure: the location of the token at which parsing failed,
+/// together with a human-readable message. Empty input is reported at
+/// `Location(0, 0)` since there is no token to point at.
+pub type ParseError {
+  ParseError(location: Location, message: String)
+}
+
+pub fn parse(
+  input: List(#(CsvToken, Location)),
+) -> Result(List(List(String)), ParseError) {
   let inner_rev = {
     use llf <- result.try(parse_p(input, Beginning, []))
     use lf <- list.try_map(llf)
@@ -32,46 +40,58 @@ pub fn parse(input: List(CsvToken)) -> Result(List(List(String)), Nil) {
 }
 
 fn parse_p(
-  input: List(CsvToken),
+  input: List(#(CsvToken, Location)),
   parse_state: ParseState,
   llf: List(List(String)),
-) -> Result(List(List(String)), Nil) {
+) -> Result(List(List(String)), ParseError) {
   case input, parse_state, llf {
     // Error Case: An empty list should produce an Error
-    [], Beginning, _ -> Error(Nil)
+    [], Beginning, _ -> Error(ParseError(Location(0, 0), "Empty input"))
 
     // BASE CASE: We are done parsing tokens
     [], _, llf -> Ok(llf)
 
     // File should begin with either Escaped or Nonescaped string
-    [Textdata(str), ..remaining_tokens], Beginning, [] ->
+    [#(Textdata(str), _), ..remaining_tokens], Beginning, [] ->
       parse_p(remaining_tokens, JustParsedField, [[str]])
 
-    [Doublequote, ..remaining_tokens], Beginning, [] ->
+    [#(Doublequote, _), ..remaining_tokens], Beginning, [] ->
       parse_p(remaining_tokens, InsideEscapedString, [[""]])
 
-    _, Beginning, _ -> Error(Nil)
+    [#(tok, loc), ..], Beginning, _ ->
+      Error(ParseError(
+        loc,
+        "Unexpected start to csv content: " <> token.to_lexeme(tok),
+      ))
 
     // If we just parsed a field, we're expecting either a comma or a CRLF
-    [Comma, ..remaining_tokens], JustParsedField, llf ->
+    [#(Comma, _), ..remaining_tokens], JustParsedField, llf ->
       parse_p(remaining_tokens, JustParsedComma, llf)
 
-    [LF, ..remaining_tokens], JustParsedField, llf ->
+    [#(LF, _), ..remaining_tokens], JustParsedField, llf ->
       parse_p(remaining_tokens, JustParsedNewline, llf)
 
-    [CR, ..remaining_tokens], JustParsedField, llf ->
+    [#(CR, _), ..remaining_tokens], JustParsedField, llf ->
       parse_p(remaining_tokens, JustParsedCR, llf)
 
-    _, JustParsedField, _ -> Error(Nil)
+    [#(tok, loc), ..], JustParsedField, _ ->
+      Error(ParseError(
+        loc,
+        "Expected comma or newline after field, found: " <> token.to_lexeme(tok),
+      ))
 
     // If we just parsed a CR, we're expecting an LF
-    [LF, ..remaining_tokens], JustParsedCR, llf ->
+    [#(LF, _), ..remaining_tokens], JustParsedCR, llf ->
       parse_p(remaining_tokens, JustParsedNewline, llf)
 
-    _, JustParsedCR, _ -> Error(Nil)
+    [#(tok, loc), ..], JustParsedCR, _ ->
+      Error(ParseError(
+        loc,
+        "Expected \"\\n\" after \"\\r\", found: " <> token.to_lexeme(tok),
+      ))
 
     // If we just parsed a comma, we're expecting an Escaped or Non-Escaped string
-    [Textdata(str), ..remaining_tokens], JustParsedComma, [
+    [#(Textdata(str), _), ..remaining_tokens], JustParsedComma, [
       curr_line,
       ..previously_parsed_lines
     ] ->
@@ -80,7 +100,7 @@ fn parse_p(
         ..previously_parsed_lines
       ])
 
-    [Doublequote, ..remaining_tokens], JustParsedComma, [
+    [#(Doublequote, _), ..remaining_tokens], JustParsedComma, [
       curr_line,
       ..previously_parsed_lines
     ] ->
@@ -89,13 +109,19 @@ fn parse_p(
         ..previously_parsed_lines
       ])
 
-    _, JustParsedComma, _ -> Error(Nil)
+    [#(tok, loc), ..], JustParsedComma, _ ->
+      Error(ParseError(
+        loc,
+        "Expected escaped or non-escaped string after comma, found: " <> token.to_lexeme(
+          tok,
+        ),
+      ))
 
     // If we just parsed a new line, we're expecting an escaped or non-escaped string
-    [Textdata(str), ..remaining_tokens], JustParsedNewline, llf ->
+    [#(Textdata(str), _), ..remaining_tokens], JustParsedNewline, llf ->
       parse_p(remaining_tokens, JustParsedField, [[str], ..llf])
 
-    [Doublequote, ..remaining_tokens], JustParsedNewline, [
+    [#(Doublequote, _), ..remaining_tokens], JustParsedNewline, [
       curr_line,
       ..previously_parsed_lines
     ] ->
@@ -104,11 +130,17 @@ fn parse_p(
         ..previously_parsed_lines
       ])
 
-    _, JustParsedNewline, _ -> Error(Nil)
+    [#(tok, loc), ..], JustParsedNewline, _ ->
+      Error(ParseError(
+        loc,
+        "Expected escaped or non-escaped string after newline, found: " <> token.to_lexeme(
+          tok,
+        ),
+      ))
 
     // If we're inside an escaped string, we can take anything until we get a double quote,
     // but a double double quote "" escapes the double quote and we keep parsing
-    [Doublequote, Doublequote, ..remaining_tokens], InsideEscapedString, [
+    [#(Doublequote, _), #(Doublequote, _), ..remaining_tokens], InsideEscapedString, [
       [str, ..rest_curr_line],
       ..previously_parsed_lines
     ] ->
@@ -117,10 +149,10 @@ fn parse_p(
         ..previously_parsed_lines
       ])
 
-    [Doublequote, ..remaining_tokens], InsideEscapedString, llf ->
+    [#(Doublequote, _), ..remaining_tokens], InsideEscapedString, llf ->
       parse_p(remaining_tokens, JustParsedField, llf)
 
-    [other_token, ..remaining_tokens], InsideEscapedString, [
+    [#(other_token, _), ..remaining_tokens], InsideEscapedString, [
       [str, ..rest_curr_line],
       ..previously_parsed_lines
     ] ->
@@ -130,6 +162,7 @@ fn parse_p(
       ])
 
     // Anything else is an error
-    _, _, _ -> Error(Nil)
+    [#(tok, loc), ..], _, _ ->
+      Error(ParseError(loc, "Unexpected token: " <> token.to_lexeme(tok)))
   }
 }
diff --git a/src/gsv/internal/token.gleam b/src/gsv/internal/token.gleam
@@ -19,6 +19,10 @@ pub type CsvToken {
   Textdata(inner: String)
 }
 
+/// A position in the csv input, given as a line and a column.
+/// `with_location` starts counting both at 1.
+pub type Location {
+  Location(line: Int, column: Int)
+}
+
 pub fn to_lexeme(token: CsvToken) -> String {
   case token {
     Comma -> ","
@@ -29,6 +33,16 @@ pub fn to_lexeme(token: CsvToken) -> String {
   }
 }
 
+/// Number of columns a token occupies: the length of the text for
+/// `Textdata`, and 1 for every other token.
+fn len(token: CsvToken) -> Int {
+  case token {
+    Textdata(text) -> string.length(text)
+    Comma | LF | CR | Doublequote -> 1
+  }
+}
+
 pub fn scan(input: String) -> List(CsvToken) {
   input
   |> string.to_utf_codepoints
@@ -49,3 +63,41 @@ pub fn scan(input: String) -> List(CsvToken) {
   })
   |> list.reverse
 }
+
+/// Pairs every token with the location at which it starts, counting from
+/// line 1, column 1. The order of the tokens is preserved.
+pub fn with_location(input: List(CsvToken)) -> List(#(CsvToken, Location)) {
+  // The helper accumulates in reverse, so flip the result back.
+  let located_rev = do_with_location(input, [], Location(1, 1))
+  list.reverse(located_rev)
+}
+
+/// Walks the token list, tagging each token with the location at which it
+/// starts. Tagged tokens are prepended to `acc`, so the returned list is in
+/// reverse order relative to `input`.
+fn do_with_location(
+  input: List(CsvToken),
+  acc: List(#(CsvToken, Location)),
+  curr_loc: Location,
+) -> List(#(CsvToken, Location)) {
+  let Location(line, column) = curr_loc
+  let next_line_start = Location(line + 1, 1)
+  case input {
+    // Nothing left to tag
+    [] -> acc
+
+    // A bare LF ends the current line
+    [LF, ..remaining] ->
+      do_with_location(remaining, [#(LF, curr_loc), ..acc], next_line_start)
+
+    // CRLF is consumed as a pair: the LF sits one column after the CR
+    [CR, LF, ..remaining] -> {
+      let located_cr = #(CR, curr_loc)
+      let located_lf = #(LF, Location(line, column + 1))
+      do_with_location(
+        remaining,
+        [located_lf, located_cr, ..acc],
+        next_line_start,
+      )
+    }
+
+    // Every other token stays on this line and advances the column by its length
+    [token, ..remaining] -> {
+      let advanced = Location(line, column + len(token))
+      do_with_location(remaining, [#(token, curr_loc), ..acc], advanced)
+    }
+  }
+}
diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
@@ -1,7 +1,9 @@
 import gleeunit
 import gleeunit/should
-import gsv/internal/token.{CR, Comma, Doublequote, LF, Textdata, scan}
-import gsv/internal/ast.{parse}
+import gsv/internal/token.{
+  CR, Comma, Doublequote, LF, Location, Textdata, scan, with_location,
+}
+import gsv/internal/ast.{ParseError, parse}
 import gsv.{Unix, Windows}
 import gleam/list
 import gleam/result
@@ -31,6 +33,7 @@ pub fn scan_test() {
 pub fn parse_test() {
   "Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE"
   |> scan
+  |> with_location
   |> parse
   |> should.equal(
     Ok([["Ben", " 25", " TRUE\n\r\""], ["Austin", " 25", " FALSE"]]),
@@ -40,7 +43,9 @@ pub fn parse_test() {
 pub fn parse_empty_string_fail_test() {
+  // The ParseError is collapsed to Nil, so only the failure itself is asserted
   ""
   |> scan
+  |> with_location
   |> parse
+  |> result.nil_error
   |> should.equal(Error(Nil))
 }
 
@@ -140,3 +145,31 @@ pub fn for_the_readme_test() {
   |> gsv.from_lists(separator: ",", line_ending: Windows)
   |> should.equal("Hello, World\r\nGoodbye, Mars")
 }
+
+pub fn error_cases_test() {
+  // Runs scan -> with_location -> parse and returns the error's location
+  // and message; panics if the input unexpectedly parses.
+  let failure_of = fn(csv_str) {
+    let outcome =
+      csv_str
+      |> scan
+      |> with_location
+      |> parse
+    case outcome {
+      Error(ParseError(loc, msg)) -> #(loc, msg)
+      Ok(_) -> panic as "Expected an error"
+    }
+  }
+  let expected_msg =
+    "Expected escaped or non-escaped string after comma, found: ,"
+
+  // Doubled comma on the first line
+  "Ben, 25,, TRUE"
+  |> failure_of
+  |> should.equal(#(Location(1, 9), expected_msg))
+
+  // Doubled comma on the second line, after an escaped field
+  "Austin, 25, FALSE\n\"Ben Peinhardt\", 25,, TRUE"
+  |> failure_of
+  |> should.equal(#(Location(2, 21), expected_msg))
+}
+// pub fn totally_panics_test() {
+//   "Ben, 25,, TRUE" |> gsv.to_lists_or_panic
+// }