[inferno-ml] Change input/output representation (#122)

Changes the internal representation of `InferenceParam` inputs/outputs. Instead of having two separate `inputs` and `outputs` as vectors, they are now both in a single field and annotated with a `ScriptInputType`, which controls readability/writability of the input. Also, instead of being a `Vector`, they are now explicitly mapped to Inferno `Ident`s The rationale for the merging `inputs`/`outputs`: - We need to provide outputs as well as inputs as arguments during script evaluation. If we don't do this, the identifiers used in `makeWrites` will not resolve to anything - We can't just concatenate the old `inputs` and `outputs` fields, because users may wish to have identically named inputs and outputs; for example, they may want to both read from and write to `input0`. This would lead to the following during script eval: ``` fun input0 input0 -> ... ``` - Accordingly, we need some way of allowing identically named identifiers to be both inputs and outputs. Combining the two argument types and storing if they are readable, writable, or both solves this The rationale for storing the Inferno identifiers: - When scripts are created (e.g. via an Inferno LSP server), the only thing that is provided is the list of Inferno `Ident`s - These `[Ident]` need to correspond exactly to the `inputs` that the `InferenceParam` contains - Even if we sort this list of identifiers, we need to make sure that the order of the `InferenceParam`'s inputs is exactly the same as the order of the original `[Ident]`. This could lead to bug-prone assumptions or workarounds, since we would't have the original `[Ident]` in the param - To make sure that we always know which order is correct, we can store the `Ident`s along with the actual inputs in a `Map` and then use `Map.toAscList` to get the correct order for script arguments Also I fixed the JSON encoding/decoding so that NaNs can be transmitted to/from `inferno-ml-server`. Previously, `[null, 0.0, 0.0]` would parse to `[IEmpty, IDouble 0.0, IDouble 0.0]`, which is of course incorrect. Now it correctly parses to `[IDouble NaN, IDouble 0.0, IDouble 0.0]`
plow-technologies · May 30, 2024 · 52f28b6 · 52f28b6
1 parent 98f8344
commit 52f28b6
Show file tree

Hide file tree

Showing 14 changed files with 190 additions and 107 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -36,6 +36,7 @@ jobs:
         with:
           install_url: https://releases.nixos.org/nix/nix-2.13.3/install
           extra_nix_config: |
+            fallback = true
             substituters = https://cache.nixos.org https://cache.iog.io
             trusted-public-keys = cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= hydra.iohk.io:f/Ea+s+dFdN+3Y/G+FDgSq+a5NEWhJGzdjvKNGv0/EQ=
             narinfo-cache-negative-ttl = 60

diff --git a/inferno-ml-server-types/CHANGELOG.md b/inferno-ml-server-types/CHANGELOG.md
@@ -1,6 +1,9 @@
 # Revision History for inferno-ml-server-types
 *Note*: we use https://pvp.haskell.org/ (MAJOR.MAJOR.MINOR.PATCH)
 
+## 0.4.0
+* Change representation of script inputs/outputs
+
 ## 0.3.0
 * Add support for tracking evaluation info
 

diff --git a/inferno-ml-server-types/inferno-ml-server-types.cabal b/inferno-ml-server-types/inferno-ml-server-types.cabal
@@ -1,6 +1,6 @@
 cabal-version: 2.4
 name:          inferno-ml-server-types
-version:       0.3.0
+version:       0.4.0
 synopsis:      Types for Inferno ML server
 description:   Types for Inferno ML server
 homepage:      https://github.com/plow-technologies/inferno.git#readme

diff --git a/inferno-ml-server-types/src/Inferno/ML/Server/Types.hs b/inferno-ml-server-types/src/Inferno/ML/Server/Types.hs
@@ -17,7 +17,7 @@ import Conduit (ConduitT)
 import Control.Applicative (asum, optional)
 import Control.Category ((>>>))
 import Control.DeepSeq (NFData (rnf), rwhnf)
-import Control.Monad (void)
+import Control.Monad (void, (<=<))
 import Data.Aeson
 import Data.Aeson.Types (Parser)
 import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec
@@ -64,6 +64,7 @@ import Database.PostgreSQL.Simple.Types
   )
 import Foreign.C (CUInt (CUInt))
 import GHC.Generics (Generic)
+import Inferno.Types.Syntax (Ident)
 import Inferno.Types.VersionControl
   ( VCObjectHash,
     byteStringToVCObjectHash,
@@ -172,8 +173,9 @@ newtype Id a = Id Int64
 
 -- | Row for the table containing inference script closures
 data InferenceScript uid gid = InferenceScript
-  { -- NOTE: This is the ID for each row, stored as a `bytea` (bytes of the hash)
+  { -- | This is the ID for each row, stored as a @bytea@ (bytes of the hash)
     hash :: VCObjectHash,
+    -- | Script closure
     obj :: VCMeta uid gid VCObject
   }
   deriving stock (Show, Eq, Generic)
@@ -237,6 +239,8 @@ data Model uid gid = Model
     -- | The user who owns the model, if any. Note that owning a model
     -- will implicitly set permissions
     user :: Maybe uid,
+    -- | The time that this model was \"deleted\", if any. For active models,
+    -- this will be @Nothing@
     terminated :: Maybe UTCTime
   }
   deriving stock (Show, Eq, Generic)
@@ -341,6 +345,8 @@ data ModelVersion uid gid c = ModelVersion
     -- the PSQL large object table
     contents :: c,
     version :: Version,
+    -- | The time that this model version was \"deleted\", if any. For active
+    -- models versions, this will be @Nothing@
     terminated :: Maybe UTCTime
   }
   deriving stock (Show, Eq, Generic)
@@ -592,13 +598,24 @@ data InferenceParam uid gid p s = InferenceParam
     -- (e.g. a UUID for use with @inferno-lsp@)
     --
     -- For existing inference params, this is the foreign key for the specific
-    -- script in the 'InferenceScript' table
+    -- script in the 'InferenceScript' table (i.e. a @VCObjectHash@)
     script :: s,
     -- | This needs to be linked to a specific version of a model rather
     -- than the @model@ table itself
     model :: Id (ModelVersion uid gid Oid),
-    inputs :: Vector (SingleOrMany p),
-    outputs :: Vector (SingleOrMany p),
+    -- | This is called @inputs@ but is also used for script outputs as
+    -- well. The access (input or output) is controlled by the 'ScriptInputType'.
+    -- For example, if this field is set to @[("input0", Single (p, Readable))]@,
+    -- the script will only have a single read-only input and will not be able to
+    -- write anywhere (note that we should disallow this scenario, as script
+    -- evaluation would not work properly)
+    --
+    -- Mapping the input\/output to the Inferno identifier helps ensure that
+    -- Inferno identifiers are always pointing to the correct input\/output;
+    -- otherwise we would need to rely on the order of the original identifiers
+    inputs :: Map Ident (SingleOrMany p, ScriptInputType),
+    -- | The time that this parameter was \"deleted\", if any. For active
+    -- parameters, this will be @Nothing@
     terminated :: Maybe UTCTime,
     user :: uid
   }
@@ -619,10 +636,6 @@ instance
       <$> field
       <*> fmap wrappedTo (field @VCObjectHashRow)
       <*> field
-      -- HACK / FIXME This is a pretty awful hack (storing as `jsonb`),
-      -- but Postgres sub-arrays need to be the same length and writing
-      -- a custom parser might be painful
-      <*> fmap getAeson field
       <*> fmap getAeson field
       <*> field
       <*> field
@@ -638,13 +651,41 @@ instance
     [ toField Default,
       ip ^. the @"script" & VCObjectHashRow & toField,
       ip ^. the @"model" & toField,
-      -- HACK / FIXME See above
       ip ^. the @"inputs" & Aeson & toField,
-      ip ^. the @"outputs" & Aeson & toField,
       toField Default,
       ip ^. the @"user" & toField
     ]
 
+-- | Controls input interaction within a script, i.e. ability to read from
+-- and\/or write to this input. Although the term \"input\" is used, those with
+-- writes enabled can also be described as \"outputs\"
+data ScriptInputType
+  = -- | Script input can be read, but not written
+    Readable
+  | -- | Script input can be written, i.e. can be used in array of
+    -- write objects returned from script evaluation
+    Writable
+  | -- | Script input can be both read from and written to; this allows
+    -- the same script identifier to point to the same PID with both
+    -- types of access enabled
+    ReadableWritable
+  deriving stock (Show, Eq, Generic)
+  deriving anyclass (NFData)
+
+instance FromJSON ScriptInputType where
+  parseJSON = withText "ScriptInputType" $ \case
+    "r" -> pure Readable
+    "w" -> pure Writable
+    "rw" -> pure ReadableWritable
+    s -> fail $ "Invalid script input type: " <> Text.unpack s
+
+instance ToJSON ScriptInputType where
+  toJSON =
+    String . \case
+      Readable -> "r"
+      Writable -> "w"
+      ReadableWritable -> "rw"
+
 -- | Information about execution time and resource usage. This is saved by
 -- @inferno-ml-server@ after script evaluation completes and can be queried
 -- later by using the same job identifier that was provided to the @/inference@
@@ -762,22 +803,42 @@ instance FromJSON IValue where
     Number n -> pure . IDouble $ toRealFloat n
     -- It's easier to just mark the time explicitly in an object,
     -- rather than try to deal with distinguishing times and doubles
-    Object o -> ITime <$> o .: "time"
+    Object o ->
+      asum
+        [ ITime <$> o .: "time",
+          fmap IArray $ arrayP =<< o .: "array"
+        ]
+    -- Note that this preserves a plain JSON array for tuples. But we need
+    -- some straightforward way of distinguishing tuples and arrays; since
+    -- the bridge often transmits a large number of individual tuples (times
+    -- and values), it's better to use arrays for the tuples and a tagged object
+    -- for arrays themselves; we often will only deal with one large array, and
+    -- adding a few bytes to this is better than adding a few bytes to thousands
+    -- of encoded tuples
     Array a
       | [x, y] <- Vector.toList a ->
-          (,) <$> parseJSON x <*> parseJSON y <&> \case
-            -- We don't want to confuse a two-element array of tuples with
-            -- a tuple itself. For example, `"[[10.0, {\"time\": 10}], [10.0, {\"time\": 10}]]"`
-            -- should parse as a two-element array of `(double, time)` tuples,
-            -- not as a `((double, time), (double, time))`. I can't think of
-            -- any reason to support the latter. An alternative would be to
-            -- change tuple encoding to an object, but then we would be transmitting
-            -- a much larger amount of data on most requests
-            (f@(ITuple _), s@(ITuple _)) -> IArray $ Vector.fromList [f, s]
-            t -> ITuple t
-      | otherwise -> IArray <$> traverse (parseJSON @IValue) a
+          fmap ITuple $ (,) <$> parseJSON x <*> parseJSON y
+      | otherwise -> fail "Only two-element tuples are supported"
     Null -> pure IEmpty
-    _ -> fail "Expected one of: string, double, time, tuple, unit (empty array), array"
+    _ -> fail "Expected one of: string, double, time, tuple, null, array"
+    where
+      arrayP :: Vector Value -> Parser (Vector IValue)
+      arrayP a =
+        -- This is a bit tedious, but we want to make sure that the array elements
+        -- are homogeneous; parsing all elements to `IValue`s first can't guarantee
+        -- this
+        asum
+          [ -- This alternative means that `null` will be correctly parsed to NaN
+            -- when inside an array of doubles
+            fmap IDouble <$> traverse parseJSON a,
+            fmap ITuple <$> traverse parseJSON a,
+            fmap IText <$> traverse parseJSON a,
+            fmap ITime <$> traverse (withObject "EpochTime" (.: "time")) a,
+            -- Nested array support
+            fmap IArray
+              <$> traverse (withObject "IArray" (arrayP <=< (.: "array"))) a,
+            fail "Expected a heterogeneous array"
+          ]
 
 instance ToJSON IValue where
   toJSON = \case
@@ -786,8 +847,9 @@ instance ToJSON IValue where
     ITuple t -> toJSON t
     -- See `FromJSON` instance above
     ITime t -> object ["time" .= t]
+    -- See `FromJSON` instance above
+    IArray is -> object ["array" .= is]
     IEmpty -> toJSON Null
-    IArray is -> toJSON is
 
 -- | Used to represent inputs to the script. 'Many' allows for an array input
 data SingleOrMany a

diff --git a/inferno-ml-server/CHANGELOG.md b/inferno-ml-server/CHANGELOG.md
@@ -1,3 +1,6 @@
+## 2023.5.29
+* Change representation of script inputs/outputs
+
 ## 2023.5.22
 * Add support for tracking evaluation info
 

diff --git a/inferno-ml-server/exe/Dummy.hs b/inferno-ml-server/exe/Dummy.hs
@@ -87,7 +87,9 @@ valueAt _ p t =
     <&> maybe IEmpty IDouble . preview (at p . _Just . at t . _Just)
 
 latestValueAndTimeBefore :: Int -> PID -> DummyM IValue
-latestValueAndTimeBefore _ _ = throwIO $ userError "Unsupported"
+latestValueAndTimeBefore _ _ =
+  throwIO $
+    userError "Unsupported: latestValueAndTimeBefore"
 
 valuesBetween :: Int64 -> PID -> Int -> Int -> ReaderT DummyEnv IO IValue
-valuesBetween _ _ _ _ = throwIO $ userError "Unsupported"
+valuesBetween _ _ _ _ = throwIO $ userError "Unsupported: valuesBetween"
diff --git a/inferno-ml-server/exe/ParseAndSave.hs b/inferno-ml-server/exe/ParseAndSave.hs
@@ -13,13 +13,14 @@ module ParseAndSave (main) where
 import Control.Category ((>>>))
 import Control.Exception (Exception (displayException))
 import Control.Monad (void)
+import Data.Aeson (eitherDecode)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Char8 as Char8
+import qualified Data.ByteString.Lazy.Char8 as Lazy.Char8
+import Data.Map.Strict (Map)
 import Data.Text (Text)
 import qualified Data.Text.IO as Text.IO
 import Data.Time.Clock.POSIX (getPOSIXTime)
-import Data.Vector (Vector)
-import qualified Data.Vector as Vector
 import Database.PostgreSQL.Simple
   ( Connection,
     Query,
@@ -37,7 +38,7 @@ import Inferno.Core
 import Inferno.ML.Server.Module.Prelude (mkBridgePrelude)
 import Inferno.ML.Server.Types
 import Inferno.ML.Types.Value (customTypes)
-import Inferno.Types.Syntax (Expr, TCScheme)
+import Inferno.Types.Syntax (Expr, Ident, TCScheme)
 import Inferno.Types.VersionControl
   ( Pinned,
     VCObjectHash,
@@ -50,35 +51,37 @@ import Inferno.VersionControl.Types
   )
 import System.Environment (getArgs)
 import System.Exit (die)
-import Text.Read (readMaybe)
 import UnliftIO.Exception (bracket, throwString)
 
 main :: IO ()
 main =
   getArgs >>= \case
-    scriptp : p : conns : _ ->
-      maybe
-        (throwString "Invalid PID")
-        (parseAndSave scriptp (Char8.pack conns) . PID)
-        $ readMaybe p
-    _ -> die "Usage ./parse <SCRIPT-PATH> <PID> <DB-STR>"
-
-parseAndSave :: FilePath -> ByteString -> PID -> IO ()
-parseAndSave p conns pid = do
+    scriptp : pstr : conns : _ ->
+      either throwString (parseAndSave scriptp (Char8.pack conns))
+        . eitherDecode
+        $ Lazy.Char8.pack pstr
+    _ -> die "Usage ./parse <SCRIPT-PATH> <PID-MAP-JSON> <DB-STR>"
+
+parseAndSave ::
+  FilePath ->
+  ByteString ->
+  Map Ident (SingleOrMany PID, ScriptInputType) ->
+  IO ()
+parseAndSave p conns inputs = do
   t <- Text.IO.readFile p
   now <- fromIntegral @Int . round <$> getPOSIXTime
   ast <-
     either (throwString . displayException) pure . (`parse` t)
       =<< mkInferno @_ @BridgeMlValue (mkBridgePrelude funs) customTypes
-  bracket (connectPostgreSQL conns) close (saveScriptAndParam ast now pid)
+  bracket (connectPostgreSQL conns) close (saveScriptAndParam ast now inputs)
 
 saveScriptAndParam ::
   (Expr (Pinned VCObjectHash) (), TCScheme) ->
   CTime ->
-  PID ->
+  Map Ident (SingleOrMany PID, ScriptInputType) ->
   Connection ->
   IO ()
-saveScriptAndParam x now pid conn = insertScript *> insertParam
+saveScriptAndParam x now inputs conn = insertScript *> insertParam
   where
     insertScript :: IO ()
     insertScript =
@@ -96,17 +99,15 @@ saveScriptAndParam x now pid conn = insertScript *> insertParam
         . InferenceParam
           Nothing
           hash
+          -- Bit of a hack. We only have one model version in the
+          -- tests, so we can just hard-code the ID here
           (Id 1)
           inputs
-          mempty
           Nothing
         $ entityIdFromInteger 0
       where
         q :: Query
-        q = [sql| INSERT INTO params VALUES (?, ?, ?, ?, ?, ?, ?) |]
-
-        inputs :: Vector (SingleOrMany PID)
-        inputs = Vector.singleton $ Single pid
+        q = [sql| INSERT INTO params VALUES (?, ?, ?, ?, ?, ?) |]
 
     vcfunc :: VCObject
     vcfunc = uncurry VCFunction x

diff --git a/inferno-ml-server/inferno-ml-server.cabal b/inferno-ml-server/inferno-ml-server.cabal
@@ -1,6 +1,6 @@
 cabal-version:      2.4
 name:               inferno-ml-server
-version:            2023.5.22
+version:            2023.5.29
 synopsis:           Server for Inferno ML
 description:        Server for Inferno ML
 homepage:           https://github.com/plow-technologies/inferno.git#readme
@@ -120,7 +120,7 @@ executable tests
 executable test-client
   import:         common
   main-is:        Client.hs
-  hs-source-dirs: exe
+  hs-source-dirs: test
   ghc-options:    -threaded -rtsopts -main-is Client
   build-depends:
     , aeson
@@ -166,8 +166,10 @@ executable parse-and-save
   hs-source-dirs: exe
   ghc-options:    -threaded -rtsopts -main-is ParseAndSave
   build-depends:
+    , aeson
     , base
     , bytestring
+    , containers
     , inferno-core
     , inferno-ml
     , inferno-ml-server