v2

bcpeinhardt · Sep 24, 2024 · 117880f · 117880f
1 parent 0dc40ea
commit 117880f
Show file tree

Hide file tree

Showing 6 changed files with 158 additions and 72 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ## Unreleased
 
+## v2.0.0 - 24 September 2024
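+- **Breaking:** the public API is now four functions: `to_lists`, `to_dicts`, `from_lists` and `from_dicts`.
+  `to_lists_or_panic` and `to_lists_or_error` have been removed, and `to_lists` now returns
+  `Result(List(List(String)), String)` (the error is a message string instead of `Nil`).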
+
 ## v1.4.0 - 29 March 2024
 - Fix bug where trailing comma was causing error
 

diff --git a/README.md b/README.md
@@ -3,11 +3,8 @@
 [![Package Version](https://img.shields.io/hexpm/v/gsv)](https://hex.pm/packages/gsv)
 [![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/gsv/)
 
-This is a simple csv parser and writer for gleam. It will get more performant in the future,
-but if you're looking for high performance now, I'd recommend doing ffi to an existing parser
-in your target runtime.
-
-We are using the grammar from [rfc 4180](https://datatracker.ietf.org/doc/html/rfc4180#section-2)
+This is a simple CSV parser and writer for Gleam. It will become faster and more battle-tested over time,
+but if you need high performance or a hardened parser today, I'd recommend using FFI to call an existing
+parser in your target runtime.
 
 #### Example
 
@@ -23,6 +20,14 @@ pub fn main() {
   // Write a List(List(String)) to a CSV string
   let csv_str = records
   |> gsv.from_lists(separator: ",", line_ending: Windows)
+
+  // Parse a CSV string with headers to a List(Dict(String, String))
+  let assert Ok(records) = gsv.to_dicts(csv_str)
+  // => [ dict.from_list([ #("Hello", "Goodbye"), #("World", "Mars") ]) ]
+
+  // Write a List(Dict(String, String)) to a CSV string, treating the keys as the header row
+  let csv_str = records
+    |> gsv.from_dicts(separator: ",", line_ending: Windows)
 }
 ```
 

diff --git a/gleam.toml b/gleam.toml
@@ -1,5 +1,5 @@
 name = "gsv"
-version = "1.4.0"
+version = "2.0.0"
 gleam = ">= 0.32.0"
 description = "A simple csv parser and generator written in gleam "
 

diff --git a/src/gsv.gleam b/src/gsv.gleam
@@ -1,51 +1,18 @@
+import gleam/dict.{type Dict}
 import gleam/int
 import gleam/list
+import gleam/pair
 import gleam/result
 import gleam/string
 import gsv/internal/ast.{ParseError}
 import gsv/internal/token.{Location}
 
-/// Parses a csv string to a list of lists of strings.
-/// Automatically handles Windows and Unix line endings.
-pub fn to_lists(input: String) -> Result(List(List(String)), Nil) {
-  input
-  |> token.scan
-  |> token.with_location
-  |> ast.parse
-  |> result.nil_error
-}
-
-/// Parses a csv string to a list of lists of strings.
-/// Automatically handles Windows and Unix line endings.
-/// Panics with an error msg if the string is not valid csv.
-pub fn to_lists_or_panic(input: String) -> List(List(String)) {
-  let res =
-    input
-    |> token.scan
-    |> token.with_location
-    |> ast.parse
-
-  case res {
-    Ok(lol) -> lol
-    Error(ParseError(Location(line, column), msg)) -> {
-      panic as {
-        "["
-        <> "line "
-        <> int.to_string(line)
-        <> " column "
-        <> int.to_string(column)
-        <> "] of csv: "
-        <> msg
-      }
-      [[]]
-    }
-  }
-}
-
 /// Parses a csv string to a list of lists of strings.
 /// Automatically handles Windows and Unix line endings.
 /// Returns a string error msg if the string is not valid csv.
-pub fn to_lists_or_error(input: String) -> Result(List(List(String)), String) {
+/// Unquoted strings are trimmed, while quoted strings have leading and trailing 
+/// whitespace preserved.
+pub fn to_lists(input: String) -> Result(List(List(String)), String) {
   input
   |> token.scan
   |> token.with_location
@@ -62,6 +29,43 @@ pub fn to_lists_or_error(input: String) -> Result(List(List(String)), String) {
   })
 }
 
+/// Parses a csv string into a list of dicts, one dict per data row, keyed by
+/// the values of the first (header) row.
+/// Parsing is delegated to `to_lists`; if it fails, its string error message
+/// is returned unchanged.
+/// Header names and cell values are trimmed before being stored.
+/// A header that is empty or whitespace-only is ignored, so cells in that
+/// column never appear in any row dict.
+/// A cell that is empty or whitespace-only is treated as absent and is not
+/// inserted into its row dict.
+/// Cells positioned past the last header are dropped.
+pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) {
+  use parsed <- result.map(to_lists(input))
+  case parsed {
+    [] -> []
+    [header_row, ..data_rows] -> {
+      // Map column index -> trimmed header name, skipping blank headers.
+      let header_by_index =
+        list.index_fold(header_row, dict.new(), fn(by_index, raw, index) {
+          case string.trim(raw) {
+            "" -> by_index
+            name -> dict.insert(by_index, index, name)
+          }
+        })
+
+      use row <- list.map(data_rows)
+      list.index_fold(row, dict.new(), fn(record, raw, index) {
+        case dict.get(header_by_index, index), string.trim(raw) {
+          // No usable header for this column.
+          Error(Nil), _ -> record
+          // Blank cell: treated as not present.
+          Ok(_), "" -> record
+          Ok(name), value -> dict.insert(record, name, value)
+        }
+      })
+    }
+  }
+}
+
+
 /// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows"
 /// line endings. Use with the `from_lists` function when 
 /// writing to a csv string.
@@ -107,3 +111,37 @@ pub fn from_lists(
   |> list.map(fn(row) { string.join(row, separator) })
   |> string.join(le_to_string(line_ending))
 }
+
+/// Takes a list of dicts and writes it to a csv string, treating the dict
+/// keys as the header row.
+/// The header row is the union of the keys of every dict, sorted with
+/// `string.compare`. Each dict becomes one row whose cells follow the header
+/// order; a key that is missing from a dict is written as an empty string so
+/// that every value stays under its own header.
+/// An empty input list produces the empty string.
+/// Will automatically escape strings that contain double quotes or
+/// line endings with double quotes (in csv, double quotes get escaped by doing
+/// a double doublequote)
+/// The string `he"llo\n` becomes `"he""llo\n"`
+pub fn from_dicts(
+  input: List(Dict(String, String)),
+  separator separator: String,
+  line_ending line_ending: LineEnding,
+) -> String {
+  case input {
+    [] -> ""
+    _ -> {
+      // Collect keys from all rows, not just the first, so rows with
+      // differing key sets still line up with the header row.
+      let headers =
+        input
+        |> list.flat_map(fn(row) {
+          row
+          |> dict.to_list
+          |> list.map(pair.first)
+        })
+        |> list.unique
+        |> list.sort(string.compare)
+
+      // Emit each row in header order, padding absent keys with "".
+      let rows =
+        list.map(input, fn(row) {
+          list.map(headers, fn(header) {
+            row
+            |> dict.get(header)
+            |> result.unwrap("")
+          })
+        })
+
+      from_lists([headers, ..rows], separator, line_ending)
+    }
+  }
+}
diff --git a/src/gsv/internal/ast.gleam b/src/gsv/internal/ast.gleam
@@ -10,6 +10,7 @@
 
 import gleam/list
 import gleam/result
+import gleam/string
 import gsv/internal/token.{
   type CsvToken, type Location, CR, Comma, Doublequote, LF, Location, Textdata,
 }
@@ -53,7 +54,7 @@ fn parse_p(
 
     // File should begin with either Escaped or Nonescaped string
     [#(Textdata(str), _), ..remaining_tokens], Beginning, [] ->
-      parse_p(remaining_tokens, JustParsedField, [[str]])
+      parse_p(remaining_tokens, JustParsedField, [[string.trim(str)]])
 
     [#(Doublequote, _), ..remaining_tokens], Beginning, [] ->
       parse_p(remaining_tokens, InsideEscapedString, [[""]])
@@ -94,39 +95,44 @@ fn parse_p(
     // (indicating an empty string)
     [#(Textdata(str), _), ..remaining_tokens],
       JustParsedComma,
-      [curr_line, ..previously_parsed_lines] ->
+      [curr_line, ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, JustParsedField, [
-        [str, ..curr_line],
+        [string.trim(str), ..curr_line],
         ..previously_parsed_lines
       ])
 
     [#(Doublequote, _), ..remaining_tokens],
       JustParsedComma,
-      [curr_line, ..previously_parsed_lines] ->
+      [curr_line, ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, InsideEscapedString, [
         ["", ..curr_line],
         ..previously_parsed_lines
       ])
 
     [#(Comma, _), ..remaining_tokens],
       JustParsedComma,
-      [curr_line, ..previously_parsed_lines] ->
+      [curr_line, ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, JustParsedComma, [
         ["", ..curr_line],
         ..previously_parsed_lines
       ])
 
     [#(CR, _), ..remaining_tokens],
       JustParsedComma,
-      [curr_line, ..previously_parsed_lines] ->
+      [curr_line, ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, JustParsedCR, [
         ["", ..curr_line],
         ..previously_parsed_lines
       ])
 
     [#(LF, _), ..remaining_tokens],
       JustParsedComma,
-      [curr_line, ..previously_parsed_lines] ->
+      [curr_line, ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, JustParsedNewline, [
         ["", ..curr_line],
         ..previously_parsed_lines
@@ -141,11 +147,12 @@ fn parse_p(
 
     // If we just parsed a new line, we're expecting an escaped or non-escaped string
     [#(Textdata(str), _), ..remaining_tokens], JustParsedNewline, llf ->
-      parse_p(remaining_tokens, JustParsedField, [[str], ..llf])
+      parse_p(remaining_tokens, JustParsedField, [[string.trim(str)], ..llf])
 
     [#(Doublequote, _), ..remaining_tokens],
       JustParsedNewline,
-      [curr_line, ..previously_parsed_lines] ->
+      [curr_line, ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, InsideEscapedString, [
         ["", ..curr_line],
         ..previously_parsed_lines
@@ -162,7 +169,8 @@ fn parse_p(
     // but a double double quote "" escapes the double quote and we keep parsing
     [#(Doublequote, _), #(Doublequote, _), ..remaining_tokens],
       InsideEscapedString,
-      [[str, ..rest_curr_line], ..previously_parsed_lines] ->
+      [[str, ..rest_curr_line], ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, InsideEscapedString, [
         [str <> "\"", ..rest_curr_line],
         ..previously_parsed_lines
@@ -173,7 +181,8 @@ fn parse_p(
 
     [#(other_token, _), ..remaining_tokens],
       InsideEscapedString,
-      [[str, ..rest_curr_line], ..previously_parsed_lines] ->
+      [[str, ..rest_curr_line], ..previously_parsed_lines]
+    ->
       parse_p(remaining_tokens, InsideEscapedString, [
         [str <> token.to_lexeme(other_token), ..rest_curr_line],
         ..previously_parsed_lines