diff --git a/docs/Project.toml b/docs/Project.toml
index 044a380b59..50ae5c6c31 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -69,7 +69,7 @@ IncompleteLU = "0.2"
 Integrals = "4"
 LineSearches = "7"
 LinearSolve = "2"
-Lux = "0.5"
+Lux = "1"
 LuxCUDA = "0.3"
 MCMCChains = "6"
 Measurements = "2"
@@ -78,12 +78,12 @@ ModelingToolkit = "9.9"
 MultiDocumenter = "0.7"
 NeuralPDE = "5.15"
 NonlinearSolve = "3"
-Optimization = "3"
-OptimizationMOI = "0.4"
-OptimizationNLopt = "0.2"
-OptimizationOptimJL = "0.2, 0.3"
-OptimizationOptimisers = "0.2"
-OptimizationPolyalgorithms = "0.2"
+Optimization = "4"
+OptimizationMOI = "0.5"
+OptimizationNLopt = "0.3"
+OptimizationOptimJL = "0.4"
+OptimizationOptimisers = "0.3"
+OptimizationPolyalgorithms = "0.3"
 OrdinaryDiffEq = "6"
 Plots = "1"
 SciMLExpectations = "2"
diff --git a/docs/make.jl b/docs/make.jl
index 04b614b937..de3a9faf26 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -25,7 +25,8 @@ makedocs(sitename = "Overview of Julia's SciML",
         "https://epubs.siam.org/doi/10.1137/0903023",
         "https://bkamins.github.io/julialang/2020/12/24/minilanguage.html",
         "https://arxiv.org/abs/2109.06786",
-        "https://arxiv.org/abs/2001.04385"],
+        "https://arxiv.org/abs/2001.04385",
+        "https://code.visualstudio.com/"],
     format = Documenter.HTML(assets = ["assets/favicon.ico"],
         canonical = "https://docs.sciml.ai/stable/",
         mathengine = mathengine),
diff --git a/docs/src/getting_started/fit_simulation.md b/docs/src/getting_started/fit_simulation.md
index d5631c28f6..4287307f7d 100644
--- a/docs/src/getting_started/fit_simulation.md
+++ b/docs/src/getting_started/fit_simulation.md
@@ -99,12 +99,14 @@ function loss(newp)
     newprob = remake(prob, p = newp)
     sol = solve(newprob, saveat = 1)
     loss = sum(abs2, sol .- xy_data)
-    return loss, sol
+    return loss
 end

 # Define a callback function to monitor optimization progress
-function callback(p, l, sol)
+function callback(state, l)
     display(l)
+    newprob = remake(prob, p = state.u)
+    sol = solve(newprob, saveat = 1)
     plt = plot(sol, ylim = (0, 6), label = ["Current x Prediction" "Current y Prediction"])
     scatter!(plt, t_data, xy_data', label = ["x Data" "y Data"])
     display(plt)
@@ -278,21 +280,13 @@ function loss(newp)
     newprob = remake(prob, p = newp)
     sol = solve(newprob, saveat = 1)
     l = sum(abs2, sol .- xy_data)
-    return l, sol
+    return l
 end
 ```

-Notice that our loss function returns the loss value as the first return,
-but returns extra information (the ODE solution with the new parameters)
-as an extra return argument.
-We will explain why this extra return information is helpful in the next section.
-
 ### Step 5: Solve the Optimization Problem

-This step will look very similar to [the first optimization tutorial](@ref first_opt),
-except now we have a new loss function `loss` which returns both the loss value
-and the associated ODE solution.
-(In the previous tutorial, `L` only returned the loss value.)
+This step will look very similar to [the first optimization tutorial](@ref first_opt).
 The `Optimization.solve` function can accept an optional callback function
-to monitor the optimization process using extra arguments returned from `loss`.
+to monitor the optimization process.
@@ -300,15 +294,14 @@ The callback syntax is always:

 ```
 callback(
-    optimization variables,
+    state,
     the current loss value,
-    other arguments returned from the loss function, ...
 )
 ```

-In this case, we will provide the callback the arguments `(p, l, sol)`,
-since it always takes the current state of the optimization first (`p`)
-then the returns from the loss function (`l, sol`).
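+Here, `state` is an `OptimizationState`; in particular, its `u` field holds
+the current parameter values, which the callbacks in these docs access as `state.u`.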
+In this case, we will provide the callback the arguments `(state, l)`,
+since it always takes the current state of the optimization first (`state`)
+and then the current loss value (`l`).
 The return value of the callback function should default to `false`.
 `Optimization.solve` will halt if/when the callback function returns `true` instead.
 Typically the `return` statement would monitor the loss value
@@ -318,8 +311,10 @@ More details about callbacks in Optimization.jl can be found
 [here](https://docs.sciml.ai/Optimization/stable/API/solve/).

 ```@example odefit
-function callback(p, l, sol)
+function callback(state, l)
     display(l)
+    newprob = remake(prob, p = state.u)
+    sol = solve(newprob, saveat = 1)
     plt = plot(sol, ylim = (0, 6), label = ["Current x Prediction" "Current y Prediction"])
     scatter!(plt, t_data, xy_data', label = ["x Data" "y Data"])
     display(plt)
@@ -327,7 +322,7 @@ function callback(p, l, sol)
 end
 ```

-With this callback function, every step of the optimization will display both the loss value and a plot of how the solution compares to the training data.
+With this callback function, every step of the optimization will display both the loss value and a plot of how the current solution compares to the training data. Since the loss function no longer returns the solution, the callback re-simulates the model from `state.u` in order to visualize the fit; this is expensive, as it adds an extra `solve` call and a plotting step to every iteration.

 Now, just like [the first optimization tutorial](@ref first_opt),
 we set up our `OptimizationFunction` and `OptimizationProblem`,
diff --git a/docs/src/highlevels/modeling_languages.md b/docs/src/highlevels/modeling_languages.md
index 576759575b..e685b6c91a 100644
--- a/docs/src/highlevels/modeling_languages.md
+++ b/docs/src/highlevels/modeling_languages.md
@@ -50,7 +50,7 @@ doing standard molecular dynamics approximations.

 ## DiffEqFinancial.jl: Financial models for use in the DifferentialEquations ecosystem

-The goal of [DiffEqFinancial.jl](https://github.com/SciML/DiffEqFinancial.jl/commits/master) is to be a feature-complete set
+The goal of [DiffEqFinancial.jl](https://github.com/SciML/DiffEqFinancial.jl/) is to be a feature-complete set
 of solvers for the types of problems found in libraries like QuantLib, such as the
 Heston process or the Black-Scholes model.
diff --git a/docs/src/showcase/blackhole.md b/docs/src/showcase/blackhole.md
index d3b90dc892..c993753558 100644
--- a/docs/src/showcase/blackhole.md
+++ b/docs/src/showcase/blackhole.md
@@ -490,10 +490,8 @@ function loss(NN_params)
         prob_nn, RK4(), u0 = u0, p = NN_params, saveat = tsteps,
         dt = dt, adaptive = false))
     pred_waveform = compute_waveform(dt_data, pred, mass_ratio, model_params)[1]
-    loss = (sum(abs2,
-        view(waveform, obs_to_use_for_training) .-
-        view(pred_waveform, obs_to_use_for_training)))
-    return loss, pred_waveform
+    loss = sum(abs2, view(waveform, obs_to_use_for_training) .-
+        view(pred_waveform, obs_to_use_for_training))
+    return loss
 end
 ```

@@ -508,10 +506,11 @@ We'll use the following callback to save the history of the loss values.
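+Unlike the previous version, the predicted waveform is no longer threaded through
+the callback's arguments; if the (currently disabled) plotting code is re-enabled,
+the prediction must be recomputed from the current parameters in `state.u`, as done below.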
 ```@example ude
 losses = []
-callback(θ, l, pred_waveform; doplot = true) = begin
+callback(state, l; doplot = true) = begin
     push!(losses, l)
     #= Disable plotting as it trains since in docs
     display(l)
+    pred_soln = solve(prob_nn, RK4(), u0 = u0, p = state.u, saveat = tsteps, dt = dt, adaptive = false)
+    pred_waveform = compute_waveform(dt_data, pred_soln, mass_ratio, model_params)[1]
     # plot current prediction against data
     plt = plot(tsteps, waveform,
         markershape=:circle, markeralpha = 0.25,
diff --git a/docs/src/showcase/gpu_spde.md b/docs/src/showcase/gpu_spde.md
index f6d3db65d0..fe43d74b4a 100644
--- a/docs/src/showcase/gpu_spde.md
+++ b/docs/src/showcase/gpu_spde.md
@@ -302,7 +302,7 @@ These last two ways enclose the pointer to our cache arrays locally but still pr
 function f(du,u,p,t) to the ODE solver.

 Now, since PDEs are large, many times we don't care about getting the whole timeseries. Using
-the [output controls from DifferentialEquations.jl](https://diffeq.sciml.ai/latest/basics/common_solver_opts.html#Output-Control-1), we can make it only output the final timepoint.
+the [output controls from DifferentialEquations.jl](https://docs.sciml.ai/DiffEqDocs/stable/basics/common_solver_opts/), we can make it only output the final timepoint.

 ```julia
 prob = ODEProblem(f, u0, (0.0, 100.0))
diff --git a/docs/src/showcase/missing_physics.md b/docs/src/showcase/missing_physics.md
index f10ff8b94c..393ad2b70d 100644
--- a/docs/src/showcase/missing_physics.md
+++ b/docs/src/showcase/missing_physics.md
@@ -222,7 +222,7 @@ current loss:
 ```@example ude
 losses = Float64[]
-callback = function (p, l)
+callback = function (state, l)
     push!(losses, l)
     if length(losses) % 50 == 0
         println("Current loss after $(length(losses)) iterations: $(losses[end])")
diff --git a/docs/src/showcase/pinngpu.md b/docs/src/showcase/pinngpu.md
index 0875715cbc..29c130e6b2 100644
--- a/docs/src/showcase/pinngpu.md
+++ b/docs/src/showcase/pinngpu.md
@@ -148,7 +148,7 @@ prob = discretize(pde_system, discretization)
 ## Step 6: Solve the Optimization Problem

 ```@example pinn
-callback = function (p, l)
+callback = function (state, l)
     println("Current loss is: $l")
     return false
 end
diff --git a/docs/src/showcase/symbolic_analysis.md b/docs/src/showcase/symbolic_analysis.md
index b95fff2599..13f996ae0a 100644
--- a/docs/src/showcase/symbolic_analysis.md
+++ b/docs/src/showcase/symbolic_analysis.md
@@ -118,7 +118,7 @@ Did you implement the DAE incorrectly? No. Is the solver broken? No. It turns out
 that this is a property of the DAE that we are attempting to solve. This kind of
 DAE is known as an index-3 DAE. For a complete discussion of DAE
-index, see [this article](https://www.scholarpedia.org/article/Differential-algebraic_equations).
+index, see [this article](http://www.scholarpedia.org/article/Differential-algebraic_equations).
 Essentially, the issue here is that we have 4 differential variables (``x``,
 ``v_x``, ``y``, ``v_y``) and one algebraic variable ``T`` (which we can know
 because there is no `D(T)` term in the equations). An index-1 DAE always
 satisfies that the Jacobian of
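All of the callback changes above follow the same Optimization.jl v4 migration: the loss function now returns only a scalar, and the callback receives `(state, l)` with the current parameters in `state.u`. A minimal end-to-end sketch of that pattern, assuming Optimization.jl v4 with OptimizationOptimJL; the `lotka!` system, data, and starting guess are illustrative stand-ins, not taken from the tutorials being edited:

```julia
using OrdinaryDiffEq, Optimization, OptimizationOptimJL

# Illustrative stand-in ODE (hypothetical, not from the docs above)
function lotka!(du, u, p, t)
    du[1] = p[1] * u[1] - p[2] * u[1] * u[2]
    du[2] = p[4] * u[1] * u[2] - p[3] * u[2]
end

prob = ODEProblem(lotka!, [1.0, 1.0], (0.0, 10.0), [1.5, 1.0, 3.0, 1.0])
data = Array(solve(prob, Tsit5(), saveat = 1))

# v4 pattern: the loss returns only the scalar loss value
function loss(newp, _)
    sol = solve(remake(prob, p = newp), Tsit5(), saveat = 1)
    return sum(abs2, Array(sol) .- data)
end

# v4 pattern: callback(state, l); anything else (such as the solution for
# plotting) must be recomputed from the current parameters in state.u
callback = function (state, l)
    println("loss = $l")
    return l < 1e-8  # returning true halts the optimization early
end

optf = OptimizationFunction(loss, Optimization.AutoForwardDiff())
optprob = OptimizationProblem(optf, [1.2, 0.8, 2.8, 1.2])
result = solve(optprob, OptimizationOptimJL.BFGS(); callback = callback, maxiters = 100)
```

Because the solution is no longer threaded through the loss return, each tutorial's callback performs its own `remake`/`solve` when it needs the trajectory, which is the recurring change across these files.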