diff --git a/.gitignore b/.gitignore
index 6018ffb2..4b473d3f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,8 +2,6 @@
 *.jl.*.cov
 *.jl.mem
 *.bson
-.vscode/
-.DS_Store
 
 # manifests
 docs/Manifest.toml
@@ -11,4 +9,3 @@ Manifest.toml
 
 # docs theme
 _flux-theme
-
diff --git a/Project.toml b/Project.toml
index 8a35d732..d4053cbb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -29,3 +29,7 @@ NNlibCUDA = "0.2"
 PartialFunctions = "1"
 julia = "1.6"
 
+[publish]
+ignore = ["^(gh-pages|juliamnt|julia.dmg)$"]
+theme = "_flux-theme"
+title = "Metalhead.jl"
diff --git a/src/Metalhead.jl b/src/Metalhead.jl
index 8dedbd65..26121b47 100644
--- a/src/Metalhead.jl
+++ b/src/Metalhead.jl
@@ -54,6 +54,8 @@ include("convnets/mobilenets/mobilenetv2.jl")
 include("convnets/mobilenets/mobilenetv3.jl")
 include("convnets/mobilenets/mnasnet.jl")
 ## Others
+include("convnets/densenet.jl")
+include("convnets/squeezenet.jl")
 include("convnets/unet.jl")
 ## Hybrid models
 include("convnets/hybrid/convnext.jl")
diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl
index 6ffcd42c..6b890833 100644
--- a/src/convnets/densenet.jl
+++ b/src/convnets/densenet.jl
@@ -1,5 +1,5 @@
 """
-    dense_bottleneck(inplanes, outplanes; expansion=4)
+    dense_bottleneck(inplanes, growth_rate)
 
 Create a Densenet bottleneck layer
 ([reference](https://arxiv.org/abs/1608.06993)).
@@ -10,7 +10,7 @@ Create a Densenet bottleneck layer
   - `outplanes`: number of output feature maps on bottleneck branch
     (and scaling factor for inner feature maps; see ref)
 """
-function dense_bottleneck(inplanes::Int, outplanes::Int; expansion::Int = 4)
+function dense_bottleneck(inplanes::Integer, outplanes::Integer; expansion::Integer = 4)
     return SkipConnection(Chain(conv_norm((1, 1), inplanes, expansion * outplanes;
                                           revnorm = true)...,
                                 conv_norm((3, 3), expansion * outplanes, outplanes;
@@ -28,7 +28,7 @@ Create a DenseNet transition sequence
   - `inplanes`: number of input feature maps
   - `outplanes`: number of output feature maps
 """
-function transition(inplanes::Int, outplanes::Int)
+function transition(inplanes::Integer, outplanes::Integer)
     return Chain(conv_norm((1, 1), inplanes, outplanes; revnorm = true)...,
                  MeanPool((2, 2)))
 end
@@ -46,7 +46,7 @@ the number of output feature maps by `growth_rates` with each block
   - `growth_rates`: the growth (additive) rates of output feature maps after each block
     (a vector of `k`s from the ref)
 """
-function dense_block(inplanes::Int, growth_rates)
+function dense_block(inplanes::Integer, growth_rates)
     return [dense_bottleneck(i, o)
             for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]),
                               growth_rates)]
@@ -54,7 +54,7 @@ end
 
 """
     densenet(inplanes, growth_rates; reduction = 0.5, dropout_prob = nothing,
-             inchannels = 3, nclasses = 1000)
+             inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Create a DenseNet model
 ([reference](https://arxiv.org/abs/1608.06993)).
@@ -68,9 +68,9 @@ Create a DenseNet model
   - `dropout_prob`: the dropout probability for the classifier head. Set to `nothing` to
     disable dropout.
   - `nclasses`: the number of output classes
 """
-function build_densenet(inplanes::Int, growth_rates; reduction = 0.5,
+function build_densenet(inplanes::Integer, growth_rates; reduction = 0.5,
                         dropout_prob = nothing,
-                        inchannels::Int = 3, nclasses::Int = 1000)
+                        inchannels::Integer = 3, nclasses::Integer = 1000)
     layers = []
     append!(layers, conv_norm((7, 7), inchannels, inplanes; stride = 2, pad = (3, 3)))
@@ -88,9 +88,9 @@ function build_densenet(inplanes::Int, growth_rates; reduction = 0.5,
 end
 
 """
-    densenet(nblocks::AbstractVector{Int}; growth_rate = 32,
-             reduction = 0.5, dropout_prob = nothing, inchannels = 3,
-             nclasses = 1000)
+    densenet(nblocks::AbstractVector{<:Integer}; growth_rate::Integer = 32,
+             reduction = 0.5, dropout_prob = nothing, inchannels::Integer = 3,
+             nclasses::Integer = 1000)
 
 Create a DenseNet model
 ([reference](https://arxiv.org/abs/1608.06993)).
@@ -104,9 +104,9 @@ Create a DenseNet model
   - `inchannels`: the number of input channels
   - `nclasses`: the number of output classes
 """
-function densenet(nblocks::AbstractVector{Int}; growth_rate::Int = 32,
-                  reduction = 0.5, dropout_prob = nothing, inchannels::Int = 3,
-                  nclasses::Int = 1000)
+function densenet(nblocks::AbstractVector{<:Integer}; growth_rate::Integer = 32,
+                  reduction = 0.5, dropout_prob = nothing, inchannels::Integer = 3,
+                  nclasses::Integer = 1000)
     return build_densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks];
                           reduction, dropout_prob, inchannels, nclasses)
 end
@@ -117,8 +117,8 @@ const DENSENET_CONFIGS = Dict(121 => [6, 12, 24, 16],
                               201 => [6, 12, 48, 32])
 
 """
-    DenseNet(config::Int; pretrain = false, growth_rate = 32,
-             reduction = 0.5, inchannels = 3, nclasses = 1000)
+    DenseNet(config::Integer; pretrain::Bool = false, growth_rate::Integer = 32,
+             reduction = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Create a DenseNet model with specified configuration. Currently supported values are
 (121, 161, 169, 201) ([reference](https://arxiv.org/abs/1608.06993)).
@@ -143,8 +143,8 @@ struct DenseNet
 end
 @functor DenseNet
 
-function DenseNet(config::Int; pretrain::Bool = false, growth_rate::Int = 32,
-                  reduction = 0.5, inchannels::Int = 3, nclasses::Int = 1000)
+function DenseNet(config::Integer; pretrain::Bool = false, growth_rate::Integer = 32,
+                  reduction = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(config, keys(DENSENET_CONFIGS))
     layers = densenet(DENSENET_CONFIGS[config]; growth_rate, reduction, inchannels,
                       nclasses)
diff --git a/src/layers/drop.jl b/src/layers/drop.jl
index 15f8e753..bcf6df58 100644
--- a/src/layers/drop.jl
+++ b/src/layers/drop.jl
@@ -11,7 +11,7 @@ ChainRulesCore.@non_differentiable _dropblock_mask(rng, x, gamma, clipped_block_
 # TODO add experimental `DropBlock` options from timm such as gaussian noise and
 # more precise `DropBlock` to deal with edges (#188)
 """
-    dropblock([rng], x::AbstractArray{T, 4}, drop_block_prob, block_size,
+    dropblock([rng = default_rng_value(x)], x::AbstractArray{T, 4}, drop_block_prob, block_size,
               gamma_scale, active::Bool = true)
 
 The dropblock function. If `active` is `true`, for each input, it zeroes out contiguous
@@ -54,7 +54,8 @@ end
 dropblock_mask(rng, x, gamma, bs) = _dropblock_mask(rng, x, gamma, bs)
 
 """
-    DropBlock(drop_block_prob = 0.1, block_size = 7, gamma_scale = 1.0, [rng])
+    DropBlock(drop_block_prob = 0.1, block_size = 7, gamma_scale = 1.0,
+              rng = default_rng_value())
 
 The `DropBlock` layer. While training, it zeroes out contiguous regions of size `block_size` in the input.
 During inference, it simply returns the input `x`.
@@ -121,7 +122,7 @@ function Base.show(io::IO, d::DropBlock)
 end
 
 """
-    StochasticDepth(p, mode = :row; [rng])
+    StochasticDepth(p, mode = :row; rng = default_rng_value())
 
 Implements Stochastic Depth. This is a `Dropout` layer from Flux that drops values with
 probability `p`.
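
For reviewers trying out the patched API, a minimal usage sketch follows. It assumes this branch of Metalhead.jl is installed; `DenseNet` is the exported constructor, while `densenet` is an internal builder and is therefore qualified with the module name. The keyword values shown are just the defaults from the signatures above.

```julia
using Metalhead

# Exported constructor: `config` must be one of the keys of DENSENET_CONFIGS
# (121, 161, 169, 201). After this patch, `config::Integer` means any integer
# subtype dispatches, not just a literal Int.
model = DenseNet(121; pretrain = false, growth_rate = 32,
                 reduction = 0.5, inchannels = 3, nclasses = 1000)

# Internal builder: `nblocks` is now AbstractVector{<:Integer}, so e.g. an
# Int32 vector is also accepted.
layers = Metalhead.densenet(Int32[6, 12, 24, 16]; growth_rate = 32, nclasses = 10)
```

The same `Int` → `Integer` loosening applies to `dense_bottleneck`, `transition`, and `build_densenet`, so callers are no longer forced to convert their arguments to the machine `Int` type before constructing a model.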