Merge pull request #88 from FluxML/dev
For a 0.1.7 release
ablaom authored Feb 23, 2021
2 parents 601c38d + 3ce61c2 commit 03f17d2
Showing 11 changed files with 159 additions and 33 deletions.
16 changes: 16 additions & 0 deletions .buildkite/pipeline.yml
@@ -0,0 +1,16 @@
steps:
- label: "Julia v1"
plugins:
- JuliaCI/julia#v1:
version: "1"
- JuliaCI/julia-test#v1:
- JuliaCI/julia-coverage#v1:
codecov: true
agents:
queue: "juliagpu"
cuda: "*"
timeout_in_minutes: 60

env:
JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
# SECRET_CODECOV_TOKEN: ""
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "MLJFlux"
uuid = "094fc8d1-fd35-5302-93ea-dabda2abf845"
authors = ["Anthony D. Blaom <[email protected]>", "Ayush Shridhar <[email protected]>"]
version = "0.1.6"
version = "0.1.7"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
24 changes: 19 additions & 5 deletions README.md
@@ -4,7 +4,22 @@ An interface to the Flux deep learning models for the
[MLJ](https://github.com/alan-turing-institute/MLJ.jl) machine
learning framework

[![Build Status](https://github.com/alan-turing-institute/MLJFlux.jl/workflows/CI/badge.svg)](https://github.com/alan-turing-institute/MLJFlux.jl/actions) [![Coverage Status](https://coveralls.io/repos/github/alan-turing-institute/MLJFlux.jl/badge.svg?branch=master)](https://coveralls.io/github/alan-turing-institute/MLJFlux.jl?branch=master)
| Branch | Julia | CPU CI | GPU CI | Coverage |
| -------- | ----- | ------ | ----- | -------- |
| `master` | v1 | [![Continuous Integration (CPU)][gha-img-master]][gha-url] | [![Continuous Integration (GPU)][buildkite-julia1-img-master]][buildkite-url] | [![Code Coverage][coveralls-img-master]][coveralls-url] |
| `dev` | v1 | [![Continuous Integration (CPU)][gha-img-dev]][gha-url] | [![Continuous Integration (GPU)][buildkite-julia1-img-dev]][buildkite-url] | [![Code Coverage][coveralls-img-dev]][coveralls-url] |

[gha-img-master]: https://github.com/FluxML/MLJFlux.jl/workflows/CI/badge.svg?branch=master "Continuous Integration (CPU)"
[gha-img-dev]: https://github.com/FluxML/MLJFlux.jl/workflows/CI/badge.svg?branch=dev "Continuous Integration (CPU)"
[gha-url]: https://github.com/FluxML/MLJFlux.jl/actions/workflows/ci.yml

[buildkite-julia1-img-master]: https://badge.buildkite.com/ae439e1f6ed6f178342a0ed166d0983de6ec1b72325e4e3e7e.svg?branch=master&step=Julia%20v1 "Continuous Integration (GPU)"
[buildkite-julia1-img-dev]: https://badge.buildkite.com/ae439e1f6ed6f178342a0ed166d0983de6ec1b72325e4e3e7e.svg?branch=dev&step=Julia%20v1 "Continuous Integration (GPU)"
[buildkite-url]: https://buildkite.com/julialang/mljflux-dot-jl

[coveralls-img-master]: https://coveralls.io/repos/github/alan-turing-institute/MLJFlux.jl/badge.svg?branch=master "Code Coverage"
[coveralls-img-dev]: https://coveralls.io/repos/github/alan-turing-institute/MLJFlux.jl/badge.svg?branch=dev "Code Coverage"
[coveralls-url]: https://github.com/FluxML/MLJFlux.jl/actions/workflows/ci.yml

MLJFlux makes it possible to apply the machine learning
meta-algorithms provided by MLJ - such as out-of-sample performance
@@ -203,7 +218,7 @@ All models share the following hyper-parameters:
7. `alpha`: The L2/L1 mix of regularization. Default = 0. Range = [0, 1]
8. `acceleration`: Use `CUDALibs()` for training on GPU; default is `CPU1()`.
8. `acceleration`: Use `CUDALibs()` for training on GPU; default is `CPU1()`.
9. `optimiser_changes_trigger_retraining`: True if fitting an
associated machine should trigger retraining from scratch whenever
@@ -244,7 +259,7 @@ function MLJFlux.build(nn::MyNetwork, n_in, n_out)
end
```
Note here that `n_in` and `n_out` depend on the size of the data (see
Note here that `n_in` and `n_out` depend on the size of the data (see
Table 1).
More generally, defining a new builder means defining a new struct
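The body of the `MyNetwork` example is collapsed in this diff view. As a separate, purely illustrative sketch (hypothetical builder name and field, not the README's own code), a builder for a single hidden layer could read:

```julia
using MLJFlux, Flux

# Hypothetical builder: the hidden-layer width is a field of the struct,
# so it becomes a tunable hyper-parameter of any model using this builder.
mutable struct MyHidden
    n_hidden::Int
end

function MLJFlux.build(builder::MyHidden, n_in, n_out)
    return Chain(Dense(n_in, builder.n_hidden, relu),
                 Dense(builder.n_hidden, n_out))
end
```

A model then receives the builder at construction time, as in `NeuralNetworkRegressor(builder=MyHidden(16))`.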
@@ -298,7 +313,7 @@ you *should* use MLJ loss functions in MLJ meta-algorithms.
We define a builder that builds a chain with six alternating
convolution and max-pool layers, and a final dense layer, which we
apply to the MNIST image dataset.
apply to the MNIST image dataset.
First we define a generic builder (working for any image size, color
or gray):
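The definition itself is collapsed in this diff view; what follows is only a sketch of what such a generic builder could look like (hypothetical struct name and fields, and it assumes the image-builder signature `MLJFlux.build(builder, n_in, n_out, n_channels)`, with `n_in` the image size as a tuple):

```julia
using MLJFlux, Flux

# Hypothetical builder: three Conv/MaxPool pairs (six layers) followed by a
# final Dense layer, working for any image size and any number of channels.
mutable struct MyConvBuilder
    filter_size::Int
    channels1::Int
    channels2::Int
    channels3::Int
end

make2d(x) = reshape(x, :, size(x)[end])   # image batch -> (features, batch) matrix

function MLJFlux.build(b::MyConvBuilder, n_in, n_out, n_channels)
    k, c1, c2, c3 = b.filter_size, b.channels1, b.channels2, b.channels3
    p = div(k - 1, 2)                     # padding so convolutions preserve image size
    front = Chain(
        Conv((k, k), n_channels => c1, relu, pad=(p, p)),
        MaxPool((2, 2)),
        Conv((k, k), c1 => c2, relu, pad=(p, p)),
        MaxPool((2, 2)),
        Conv((k, k), c2 => c3, relu, pad=(p, p)),
        MaxPool((2, 2)),
        make2d)
    # infer the flattened dimension with a dummy forward pass:
    d = size(front(zeros(Float32, n_in..., n_channels, 1)), 1)
    return Chain(front, Dense(d, n_out))
end
```

The dummy forward pass is just one way to infer the input size of the final `Dense` layer without hard-coding the image dimensions.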
@@ -390,4 +405,3 @@ julia> evaluate!(mach,
│ misclassification_rate │ 0.0467 │ [0.0467] │
└────────────────────────┴───────────────┴────────────┘
```
13 changes: 9 additions & 4 deletions src/classifier.jl
@@ -67,7 +67,7 @@ function MLJModelInterface.fit(model::NeuralNetworkClassifier,
data,
model.acceleration)

cache = (deepcopy(model), data, history, n_input, n_output)
cache = (deepcopy(model), data, history, n_input, n_output, optimiser)
fitresult = (chain, levels)
report = (training_losses=history, )

@@ -90,7 +90,7 @@ function MLJModelInterface.update(model::NeuralNetworkClassifier,
X,
y)

old_model, data, old_history, n_input, n_output = old_cache
old_model, data, old_history, n_input, n_output, optimiser = old_cache
old_chain, levels = old_fitresult

optimiser_flag = model.optimiser_changes_trigger_retraining &&
@@ -109,7 +109,12 @@ function MLJModelInterface.update(model::NeuralNetworkClassifier,
epochs = model.epochs
end

optimiser = deepcopy(model.optimiser)
# we only get to keep the optimiser "state" carried over from
# previous training if we're doing a warm restart and the user has not
# changed the optimiser hyper-parameter:
if !keep_chain || model.optimiser != old_model.optimiser
optimiser = deepcopy(model.optimiser)
end

chain, history = fit!(chain,
optimiser,
@@ -126,7 +131,7 @@ function MLJModelInterface.update(model::NeuralNetworkClassifier,
end

fitresult = (chain, levels)
cache = (deepcopy(model), data, history, n_input, n_output)
cache = (deepcopy(model), data, history, n_input, n_output, optimiser)
report = (training_losses=history, )

return fitresult, cache, report
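The net effect of threading `optimiser` through the cache is that a warm restart can now reuse accumulated optimiser state (ADAM moments, momentum, and so on). A hedged sketch of the user-level behaviour this enables (hypothetical data and hyper-parameter values, not code from this repository):

```julia
using MLJ, MLJFlux
import Flux

X, y = @load_iris                    # any small classification task will do
clf  = NeuralNetworkClassifier(epochs=5, optimiser=Flux.ADAM(0.001))
mach = machine(clf, X, y)
fit!(mach)                           # cold start: trains for 5 epochs

clf.epochs = 10                      # only `epochs` changed, so the next fit! is a
fit!(mach)                           # warm restart and reuses the cached optimiser,
                                     # including any accumulated ADAM moments

clf.optimiser = Flux.ADAM(0.01)      # optimiser hyper-parameter changed: the
clf.epochs = 15                      # carried-over optimiser state is discarded and
fit!(mach)                           # a fresh copy of the new optimiser is used
```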
4 changes: 2 additions & 2 deletions src/core.jl
@@ -1,6 +1,6 @@
## EXPOSE OPTIMISERS TO MLJ (for eg, tuning)

# Here we: (i) Make the optimiser structs "transarent" so that their
# Here we: (i) Make the optimiser structs "transparent" so that their
# field values are exposed by calls to MLJ.params; and (ii) Overload
# `==` for optimisers, so that we can detect when their parameters
# remain unchanged on calls to MLJModelInterface.update methods.
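A sketch of the kind of overload meant in (ii) (illustrative only; MLJFlux defines this generically for its supported Flux optimisers rather than per type): hyper-parameter fields are compared and any mutable training state is ignored.

```julia
import Flux

# Illustrative only; assumes Flux 0.11's ADAM, whose fields are `eta`, `beta`
# and a mutable `state::IdDict`. Two ADAMs compare equal when their
# hyper-parameters agree, regardless of what state they have accumulated.
Base.:(==)(o1::Flux.ADAM, o2::Flux.ADAM) =
    o1.eta == o2.eta && o1.beta == o2.beta
```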
@@ -134,7 +134,7 @@ function fit!(chain, optimiser, loss, epochs,
verbosity != 1 || next!(meter)

end

return Flux.cpu(chain), history

end
15 changes: 11 additions & 4 deletions src/image.jl
@@ -73,7 +73,9 @@ function MLJModelInterface.fit(model::ImageClassifier,
data,
model.acceleration)

cache = deepcopy(model), data, history, n_input, n_output
# `optimiser` is now mutated

cache = (deepcopy(model), data, history, n_input, n_output, optimiser)
fitresult = (chain, levels)

report = (training_losses=history, )
@@ -96,7 +98,7 @@ function MLJModelInterface.update(model::ImageClassifier,
X,
y)

old_model, data, old_history, n_input, n_output = old_cache
old_model, data, old_history, n_input, n_output, optimiser = old_cache
old_chain, levels = old_fitresult

optimiser_flag = model.optimiser_changes_trigger_retraining &&
@@ -120,7 +122,12 @@ function MLJModelInterface.update(model::ImageClassifier,
epochs = model.epochs
end

optimiser = deepcopy(model.optimiser)
# we only get to keep the optimiser "state" carried over from
# previous training if we're doing a warm restart and the user has not
# changed the optimiser hyper-parameter:
if !keep_chain || model.optimiser != old_model.optimiser
optimiser = deepcopy(model.optimiser)
end

chain, history = fit!(chain,
optimiser,
@@ -137,7 +144,7 @@ function MLJModelInterface.update(model::ImageClassifier,
end

fitresult = (chain, levels)
cache = (deepcopy(model), data, history, n_input, n_output)
cache = (deepcopy(model), data, history, n_input, n_output, optimiser)
report = (training_losses=history, )

return fitresult, cache, report
34 changes: 23 additions & 11 deletions src/regressor.jl
@@ -20,7 +20,7 @@ function NeuralNetworkRegressor(; builder::B = Linear()
, optimiser_changes_trigger_retraining=false
, acceleration = CPU1()
) where {B,O,L}

model = NeuralNetworkRegressor{B,O,L}(builder
, optimiser
, loss
@@ -30,10 +30,10 @@ function NeuralNetworkRegressor(; builder::B = Linear()
, alpha
, optimiser_changes_trigger_retraining
, acceleration)

message = clean!(model)
isempty(message) || @warn message

return model
end

@@ -75,7 +75,7 @@ function MultitargetNeuralNetworkRegressor(; builder::B = Linear()

return model
end

const Regressor =
Union{NeuralNetworkRegressor, MultitargetNeuralNetworkRegressor}

@@ -107,7 +107,9 @@ function MLJModelInterface.fit(model::Regressor, verbosity::Int, X, y)
data,
model.acceleration)

cache = (deepcopy(model), data, history, n_input, n_output)
# note: "state" part of `optimiser` is now mutated!

cache = (deepcopy(model), data, history, n_input, n_output, optimiser)
fitresult = (chain, target_is_multivariate, target_column_names)
report = (training_losses=history,)

@@ -122,7 +124,11 @@ function MLJModelInterface.update(model::Regressor,
X,
y)

old_model, data, old_history, n_input, n_output = old_cache
# note: the `optimiser` in `old_cache` stores "state" (eg,
# momentum); the "state" part of the `optimiser` field of `model`
# and of `old_model` play no role

old_model, data, old_history, n_input, n_output, optimiser = old_cache
old_chain, target_is_multivariate, target_column_names = old_fitresult

optimiser_flag = model.optimiser_changes_trigger_retraining &&
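The "state" in question is the per-parameter memory a Flux optimiser accumulates as it is used; this is why the copy stored in the cache, rather than the model's `optimiser` field, is what matters for warm restarts. A minimal sketch (assuming Flux 0.11's implicit-parameter API and ADAM's mutable `state::IdDict` field):

```julia
import Flux

opt = Flux.ADAM(0.001)
W   = rand(Float32, 3, 3)
ps  = Flux.params(W)

@assert isempty(opt.state)             # no moments recorded yet

gs = Flux.gradient(() -> sum(W), ps)   # a trivial "loss", just to get gradients
Flux.Optimise.update!(opt, ps, gs)

@assert !isempty(opt.state)            # ADAM's moment estimates now live inside
                                       # the optimiser object itself
```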
@@ -140,7 +146,12 @@ function MLJModelInterface.update(model::Regressor,
epochs = model.epochs
end

optimiser = deepcopy(model.optimiser)
# we only get to keep the optimiser "state" carried over from
# previous training if we're doing a warm restart and the user has not
# changed the optimiser hyper-parameter:
if !keep_chain || model.optimiser != old_model.optimiser
optimiser = deepcopy(model.optimiser)
end

chain, history = fit!(chain,
optimiser,
@@ -155,8 +166,9 @@ function MLJModelInterface.update(model::Regressor,
# note: history[1] = old_history[end]
history = vcat(old_history[1:end-1], history)
end

fitresult = (chain, target_is_multivariate, target_column_names)
cache = (deepcopy(model), data, history, n_input, n_output)
cache = (deepcopy(model), data, history, n_input, n_output, optimiser)
report = (training_losses=history,)

return fitresult, cache, report
Expand All @@ -167,16 +179,16 @@ function MLJModelInterface.predict(model::Regressor, fitresult, Xnew_)

chain , target_is_multivariate, target_column_names = fitresult

Xnew_ = MLJModelInterface.matrix(Xnew_)
Xnew_ = MLJModelInterface.matrix(Xnew_)

if target_is_multivariate
ypred = [chain(values.(Xnew_[i, :]))
for i in 1:size(Xnew_, 1)]
for i in 1:size(Xnew_, 1)]
return MLJModelInterface.table(reduce(hcat, y for y in ypred)',
names=target_column_names)
else
return [chain(values.(Xnew_[i, :]))[1]
for i in 1:size(Xnew_, 1)]
for i in 1:size(Xnew_, 1)]
end
end

8 changes: 8 additions & 0 deletions test/classifier.jl
@@ -57,6 +57,14 @@ losses = []
push!(losses, first_last_training_loss[2])
yhat = MLJBase.predict(mach, rows=test);
@test mean(MLJBase.cross_entropy(yhat, y[test])) < 0.95*loss_baseline

optimisertest(MLJFlux.NeuralNetworkClassifier,
X,
y,
builder,
optimiser,
accel)

end

# check that different resources (CPU1, CUDALibs, etc.) give about the same loss:
10 changes: 8 additions & 2 deletions test/image.jl
@@ -53,6 +53,9 @@ losses = []
@test basictest(MLJFlux.ImageClassifier, images, labels,
model.builder, model.optimiser, 0.95, accel)

@test optimisertest(MLJFlux.ImageClassifier, images, labels,
model.builder, model.optimiser, accel)

end

# check that different resources (CPU1, CUDALibs, etc.) give about the same loss:
@@ -112,7 +115,7 @@ end

# check that different resources (CPU1, CUDALibs, etc.) give about the same loss:
reference = losses[1]
@test all(x->abs(x - reference)/reference < 1e-4, losses[2:end])
@test all(x->abs(x - reference)/reference < 1e-3, losses[2:end])


## BASIC IMAGE TESTS COLOR
@@ -139,7 +142,7 @@ losses = []
# tests update logic, etc (see test_utililites.jl):
@test basictest(MLJFlux.ImageClassifier, images, labels,
model.builder, model.optimiser, 0.95, accel)

@time fitresult, cache, _report = MLJBase.fit(model, 0, images, labels)
pred = MLJBase.predict(model, fitresult, images[1:6])
first_last_training_loss = _report[1][[1, end]]
@@ -153,6 +156,9 @@ losses = []
acceleration=accel)
fitresult, cache, _report = MLJBase.fit(model, 0, images, labels);

@test optimisertest(MLJFlux.ImageClassifier, images, labels,
model.builder, model.optimiser, accel)

end

# check that different resources (CPU1, CUDALibs, etc.) give about the same loss:
22 changes: 19 additions & 3 deletions test/regressor.jl
@@ -17,7 +17,7 @@ train, test = MLJBase.partition(1:N, 0.7)
@testset_accelerated "NeuralNetworkRegressor" accel begin

Random.seed!(123)

basictest(MLJFlux.NeuralNetworkRegressor,
X,
y,
@@ -38,6 +38,14 @@ train, test = MLJBase.partition(1:N, 0.7)
truth = y[test]
goal = 0.9*model.loss(truth .- mean(truth), 0)
@test model.loss(yhat, truth) < goal

optimisertest(MLJFlux.NeuralNetworkRegressor,
X,
y,
builder,
optimiser,
accel)

end

# check that different resources (CPU1, CUDALibs, etc.) give about the same loss:
@@ -53,7 +61,7 @@ losses = []
@testset_accelerated "MultitargetNeuralNetworkRegressor" accel begin

Random.seed!(123)

basictest(MLJFlux.MultitargetNeuralNetworkRegressor,
X,
y,
@@ -69,11 +77,19 @@ losses = []
fit(model, 0, MLJBase.selectrows(X, train), selectrows(y, train))
first_last_training_loss = rpt[1][[1, end]]
push!(losses, first_last_training_loss[2])
# @show first_last_training_loss
# @show first_last_training_loss
yhat = predict(model, fitresult, selectrows(X, test))
truth = ymatrix[test]
goal = 0.9*model.loss(truth .- mean(truth), 0)
@test model.loss(Tables.matrix(yhat), truth) < goal

optimisertest(MLJFlux.MultitargetNeuralNetworkRegressor,
X,
y,
builder,
optimiser,
accel)

end

# check that different resources (CPU1, CUDALibs, etc.) give about the same loss: