add residuals method for GLM #499

Open
wants to merge 34 commits into base: master

Changes shown are from 2 of the 34 commits.

Commits
8625214
add residuals method for GLM
palday Sep 20, 2022
30a77ab
set default to deviance residuals
palday Sep 20, 2022
92660be
Apply suggestions from code review
palday Sep 21, 2022
ca6342b
restore whitespace in f-tests
palday Sep 21, 2022
66325eb
add in tests for other families
palday Sep 21, 2022
ffd7c97
update doctests
palday Sep 29, 2022
d8e0434
Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid
palday Sep 29, 2022
e90aad8
Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid
palday Nov 11, 2022
9480b0c
Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid
palday Dec 3, 2022
613f5fe
add code for pearson
palday Dec 8, 2022
476fc6f
Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid
palday Dec 8, 2022
dd491b5
whitespace
palday Dec 8, 2022
5be6314
whitespace
palday Dec 8, 2022
b58c3bd
whitespace
palday Dec 8, 2022
3cb863e
again
palday Dec 8, 2022
cfa33e0
Geometric and negative binomial
palday Dec 9, 2022
ee87fc7
residuals for normal
palday Dec 9, 2022
dfa5f15
Update test/runtests.jl
mousum-github Jan 29, 2023
781ecd1
Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid
palday Apr 14, 2023
8eb70a8
Update src/glmfit.jl
mousum-github May 4, 2023
c8b69b6
Update src/glmfit.jl
mousum-github May 4, 2023
ae0b680
Update src/glmfit.jl
mousum-github May 4, 2023
88345d6
Update test/runtests.jl
mousum-github May 4, 2023
f55713a
Update test/runtests.jl
mousum-github May 4, 2023
64eda0b
Update test/runtests.jl
mousum-github May 4, 2023
6bd8980
Update test/runtests.jl
mousum-github May 4, 2023
3445319
Update test/runtests.jl
mousum-github May 4, 2023
fe6800b
Update test/runtests.jl
mousum-github May 4, 2023
282e321
Update test/runtests.jl
mousum-github May 4, 2023
904e158
Update test/runtests.jl
mousum-github May 4, 2023
1a1cbeb
Update test/runtests.jl
mousum-github May 4, 2023
d90bab5
Update test/runtests.jl
mousum-github May 4, 2023
e9865be
Update test/runtests.jl
mousum-github May 4, 2023
c3fbc4a
Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid
palday Sep 2, 2023
34 changes: 34 additions & 0 deletions src/glmfit.jl
@@ -731,3 +731,37 @@ function checky(y, d::Binomial)
end
return nothing
end

# need to add :pearson
const _RESIDUAL_TYPES = [:deviance, :response, :working]

"""
residuals(model::GeneralizedLinearModel; type=:deviance)

Return the residuals of a GLM.

Supported residual types are:
- `:response`: the difference between the observed and fitted values
- `:deviance`: the signed square root of the elementwise
contribution to the deviance
- `:working`: working residuals (used during the IRLS process)
"""
function residuals(model::GeneralizedLinearModel; type=:deviance)
type in _RESIDUAL_TYPES ||
Review comment (Member): Why not use the else in the conditional to catch unsupported types rather than maintaining a separate list of types? Users can consult the docstring if they're not sure what's supported. The only other place the constant is used is in the tests, where you loop over it; you could just move the list there, where it's needed.

throw(ArgumentError("Unsupported type `$(type)`; supported types are " *
"$(_RESIDUAL_TYPES)"))
# TODO: add an optimized method for Normal with the identity link
res = if type == :response
response(model) - fitted(model)
elseif type == :deviance
# XXX I think this might be the same as
# 2 * wrkresid, but I'm not 100% sure if that holds across families
Comment on lines +867 to +868 (Member Author): @dmbates Does this relationship hold across families?

Reply (Member): Can you use the devresid function here?

Reply (Member Author): No, it seems that that function is lower-level and used in the computation of the associated field rather than returning said field.

sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid)
elseif type == :working
return model.rr.wrkresid
else
error("An error has occurred. Please file an issue on GitHub.")
end

return res
end
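The reviewer's alternative above, letting the final `else` branch reject unsupported types instead of validating against a separately maintained list, could be sketched as follows. This is a standalone simplification with hypothetical argument names (`resp`, `fit`, `wrkresid`, `devresid` stand in for the fields a fitted GLM would provide), not the package's actual method:

```julia
# Standalone sketch: dispatch on the residual type, with the final
# `else` rejecting anything unsupported, so no separate list of valid
# symbols has to be kept in sync with the branches.
function residuals_sketch(resp, fit, wrkresid, devresid; type::Symbol=:deviance)
    if type == :response
        resp .- fit
    elseif type == :working
        wrkresid
    elseif type == :deviance
        sign.(resp .- fit) .* sqrt.(devresid)
    else
        throw(ArgumentError("unsupported residual type `$(type)`"))
    end
end
```

With this shape, a constant like `_RESIDUAL_TYPES` is only needed in the tests that loop over the supported symbols.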
2 changes: 1 addition & 1 deletion src/glmtools.jl
@@ -514,7 +514,7 @@ function loglik_obs end
loglik_obs(::Bernoulli, y, μ, wt, ϕ) = wt*logpdf(Bernoulli(μ), y)
loglik_obs(::Binomial, y, μ, wt, ϕ) = logpdf(Binomial(Int(wt), μ), Int(y*wt))
loglik_obs(::Gamma, y, μ, wt, ϕ) = wt*logpdf(Gamma(inv(ϕ), μ*ϕ), y)
# In Distributions.jl, a Geometric distribution characterizes the number of failures before
# the first success in a sequence of independent Bernoulli trials with success rate p.
# The mean of Geometric distribution is (1 - p) / p.
# Hence, p = 1 / (1 + μ).
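The comment's algebra can be checked numerically: if μ = (1 - p) / p, then p = 1 / (1 + μ) recovers the success rate. A quick sanity check (illustrative values, not package code):

```julia
# Mean of a Geometric(p) distribution: expected number of failures
# before the first success.
geom_mean(p) = (1 - p) / p

# Inverting the mean relation, as the comment above derives.
p_from_mean(μ) = 1 / (1 + μ)

μ = 2.5
p = p_from_mean(μ)   # 1 / 3.5 ≈ 0.2857
geom_mean(p)         # recovers μ = 2.5
```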
63 changes: 44 additions & 19 deletions test/runtests.jl
@@ -51,22 +51,22 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y
end

@testset "Linear Model Cook's Distance" begin
st_df = DataFrame(
Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4],
XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5],
XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8],
XC=[3., 13., 23., 39.8, 34., 31.],
# values from SAS proc reg
CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932],
CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538],
CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157],
)

# linear regression
t_lm_base = lm(@formula(Y ~ XA), st_df)
@test isapprox(st_df.CooksD_base, cooksdistance(t_lm_base))

# linear regression, no intercept
t_lm_noint = lm(@formula(Y ~ XA +0), st_df)
@test isapprox(st_df.CooksD_noint, cooksdistance(t_lm_noint))

@@ -82,7 +82,7 @@ end

end

@testset "linear model with weights" begin
df = dataset("quantreg", "engel")
N = nrow(df)
df.weights = repeat(1:5, Int(N/5))
@@ -94,13 +94,17 @@ end
@test isapprox(stderror(lm_model), [9.382302620120193, 0.00816741377772968])
@test isapprox(r2(lm_model), 0.8330258148644486)
@test isapprox(adjr2(lm_model), 0.832788298242634)
@test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813;
-0.06772589439264813 6.670664781664879e-5])
@test isapprox(first(predict(lm_model)), 357.57694841780994)
@test isapprox(loglikelihood(lm_model), -4353.946729075838)
@test isapprox(loglikelihood(glm_model), -4353.946729075838)
@test isapprox(nullloglikelihood(lm_model), -4984.892139711452)
@test isapprox(mean(residuals(lm_model)), -5.412966629787718)

# this should be elementwise true (which is a stronger condition than
# the vectors being approximately equal), so we test elementwise
@test all(residuals(glm_model; type=:response) .≈ residuals(lm_model))
end
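The distinction the test comment draws can be seen in pure Julia (illustrative values only): `isapprox` on vectors compares an overall norm, so it can pass even when one element is relatively far off, while the elementwise broadcast catches it:

```julia
v = [1.0, 1e-12]
w = [1.0, 2e-12]

# Vector isapprox compares norm distance, which is dominated by the
# large first element, so this passes...
isapprox(v, w)   # true

# ...but the elementwise comparison flags the second entry, whose
# relative difference is large even though its absolute difference
# is tiny.
all(v .≈ w)      # false
```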

@testset "rankdeficient" begin
@@ -315,6 +315,27 @@ admit.rank = categorical(admit.rank)
@test isapprox(coef(gm2),
[-3.9899786606380756, 0.0022644256521549004, 0.804037453515578,
-0.6754428594116578, -1.340203811748108, -1.5514636444657495])
res = residuals(gm2; type=:deviance)
# values from R
@test isapprox(res[1:10],
[-0.6156283, 1.568695, 0.7787919, 1.856779, -0.5019254,
1.410201, 1.318558, -0.6994666, 1.792076, -1.207922]; atol=1e-6)
@test isapprox(res[390:400],
[-1.015303, 1.352001, 1.199244, 1.734904, 1.283653, 1.656921,
-1.158223, -0.6015442, -0.6320556, -1.116244, -0.8458358]; atol=1e-6)
res = residuals(gm2; type=:response)
# values from R
@test isapprox(res[1:10],
[-0.1726265, 0.707825, 0.2615918, 0.8216154, -0.1183539,
0.6300301, 0.5807538, -0.2170033, 0.7992648, -0.5178682], atol=1e-6)
@test isapprox(res[390:400],
[-0.4027505, 0.5990641, 0.512806, 0.7779709, 0.5612748,
0.7465767, -0.48867, -0.1655043, -0.1810622, -0.4636674, -0.3007306]; atol=1e-6)
# less extensive test here because it's just grabbing the IRLS values, which are already
# extensively tested
@test isapprox(residuals(gm2; type=:working)[1:10],
[-1.208644, 3.422607, 1.354264, 5.605865, -1.134242,
2.702922, 2.385234, -1.277145, 4.981688, -2.074122]; atol=1e-5)
end
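For a Bernoulli/logit model like `gm2`, the deviance residuals being tested reduce to a simple closed form that can be reproduced without the package. A sketch with hypothetical `y` and `μ` values (not the admissions data):

```julia
# Hand-rolled response and deviance residuals for binary data.
# For y ∈ {0, 1}, the unit deviance contribution is -2log(μ) when
# y == 1 and -2log(1 - μ) when y == 0.
y = [1.0, 0.0, 1.0, 0.0]
μ = [0.8, 0.3, 0.6, 0.1]

unitdev(y, μ) = y == 1 ? -2log(μ) : -2log(1 - μ)

response_resid = y .- μ
deviance_resid = sign.(y .- μ) .* sqrt.(unitdev.(y, μ))
```

The signs track `y - μ`, matching the `sign.(response(model) .- fitted(model))` factor in the implementation above.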

@testset "Bernoulli ProbitLink" begin
@@ -707,8 +707,8 @@ end
@test aic(gm22) ≈ 1112.7422553284146
@test aicc(gm22) ≈ 1113.7933502189255
@test bic(gm22) ≈ 1136.6111083020812
@test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235,
-0.4497584898742737, 0.08622664933901254, 0.3558996662512287,
0.29016080736927813]
@test stderror(gm22) ≈ [0.22754287093719366, 0.15274755092180423, 0.15928431669166637,
0.23853372776980591, 0.2354231414867577, 0.24750780320597515,
@@ -823,7 +823,7 @@ end
gm11 = fit(GeneralizedLinearModel, X, Y, Binomial())
@test isapprox(predict(gm11), Y)
@test predict(gm11) == fitted(gm11)

newX = rand(rng, 5, 2)
newY = logistic.(newX * coef(gm11))
gm11_pred1 = predict(gm11, newX)
@@ -1055,7 +1080,7 @@ end
──────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
──────────────────────────────────────────────────────────────────
[1]    3        0.1283                    0.9603
[2] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-07
──────────────────────────────────────────────────────────────────"""
else
@@ -1064,7 +1089,7 @@ end
─────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
─────────────────────────────────────────────────────────────────
[1]    3        0.1283                    0.9603
[2] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7
─────────────────────────────────────────────────────────────────"""
end
@@ -1078,7 +1103,7 @@ end
──────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
──────────────────────────────────────────────────────────────────
[1]    2        3.2292                   -0.0000
[2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07
──────────────────────────────────────────────────────────────────"""
else
@@ -1087,7 +1112,7 @@ end
─────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
─────────────────────────────────────────────────────────────────
[1]    2        3.2292                   -0.0000
[2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7
─────────────────────────────────────────────────────────────────"""
end
@@ -1103,7 +1128,7 @@ end
──────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
──────────────────────────────────────────────────────────────────
[1]    2        3.2292                   -0.0000
[2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07
[3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950
──────────────────────────────────────────────────────────────────"""
@@ -1113,7 +1138,7 @@ end
──────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
──────────────────────────────────────────────────────────────────
[1]    2        3.2292                   -0.0000
[2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7
[3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950
──────────────────────────────────────────────────────────────────"""
@@ -1129,7 +1154,7 @@ end
──────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
──────────────────────────────────────────────────────────────────
[1]    5        0.1017                    0.9685
[2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950
[3] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-07
──────────────────────────────────────────────────────────────────"""
@@ -1139,7 +1164,7 @@ end
──────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
──────────────────────────────────────────────────────────────────
[1]    5        0.1017                    0.9685
[2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950
[3] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7
──────────────────────────────────────────────────────────────────"""