diff --git a/docs/src/index.md b/docs/src/index.md index 65fd104f..12ca1f83 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -79,11 +79,14 @@ Using a `CategoricalVector` constructed with `categorical` or `categorical!`: ```jldoctest categorical julia> using CategoricalArrays, DataFrames, GLM, StableRNGs + julia> rng = StableRNG(1); # Ensure example can be reproduced + julia> data = DataFrame(y = rand(rng, 100), x = categorical(repeat([1, 2, 3, 4], 25))); + julia> lm(@formula(y ~ x), data) LinearModel @@ -105,8 +108,10 @@ Using [`contrasts`](https://juliastats.github.io/StatsModels.jl/stable/contrasts ```jldoctest categorical julia> using StableRNGs + julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25)); + julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding())) LinearModel @@ -130,12 +135,16 @@ which computes an F-test between each pair of subsequent models and reports fit ```jldoctest julia> using DataFrames, GLM, StableRNGs + julia> data = DataFrame(y = (1:50).^2 .+ randn(StableRNG(1), 50), x = 1:50); + julia> ols_lin = lm(@formula(y ~ x), data); + julia> ols_sq = lm(@formula(y ~ x + x^2), data); + julia> ftest(ols_lin.model, ols_sq.model) F-test: 2 models fitted on 50 observations ───────────────────────────────────────────────────────────────────────────────── diff --git a/src/glmfit.jl b/src/glmfit.jl index 3d0ca322..87d1689a 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -840,3 +840,38 @@ function checky(y, d::Binomial) end return nothing end + +const _RESIDUAL_TYPES = [:deviance, :pearson, :response, :working] + +""" + residuals(model::GeneralizedLinearModel; type=:deviance) + +Return the residuals of a GLM. + +Supported values for `type` are: +- `:deviance` (the default): the signed square root of the element-wise + contribution to the deviance +- `:response`: the difference between the observed and fitted values +- `:working`: working residuals (used during the IRLS process) +- `:pearson`: Pearson residuals, i.e., response residuals scaled + by the standard error of the response +""" +function residuals(model::GeneralizedLinearModel; type=:deviance) + type in _RESIDUAL_TYPES || + throw(ArgumentError("Unsupported type `$(type)``; supported types are" * + "$(_RESIDUAL_TYPES)")) + # TODO: add in optimized method for normal with identity link + if type === :response + return response(model) - fitted(model) + elseif type === :deviance + # XXX I think this might be the same as + # 2 * wrkresid, but I'm not 100% sure if that holds across families + return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid) + elseif type === :pearson + return copysign.(model.rr.wrkresid, response(model) .- fitted(model)) .* sqrt.(model.rr.wrkwt) + elseif type === :working + return model.rr.wrkresid + else + error("An error has occurred. Please file an issue on GitHub.") + end +end diff --git a/test/runtests.jl b/test/runtests.jl index cbb21f1b..a7e7d8ce 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,6 @@ +# NB: trailing white space must be preserved in the tests for `show` +# If your editor is set to automatically strip it, this will cause problems + using CategoricalArrays, CSV, DataFrames, LinearAlgebra, SparseArrays, StableRNGs, Statistics, StatsBase, Test, RDatasets using GLM @@ -121,7 +124,11 @@ end @test isapprox(loglikelihood(lm_model), -4353.946729075838) @test isapprox(loglikelihood(glm_model), -4353.946729075838) @test isapprox(nullloglikelihood(lm_model), -4984.892139711452) - @test isapprox(mean(residuals(lm_model)), -5.412966629787718) + @test isapprox(mean(residuals(lm_model)), -5.412966629787718) + + # this should be elementwise true (which is a stronger condition than + # vectors being approximately equal) the so we test elementwise + @test all(residuals(glm_model, type = :working) .≈ residuals(lm_model)) end @testset "Linear model with dropcollinearity and $dmethod" for dmethod in (:cholesky, :qr) @@ -374,6 +381,24 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], @test isapprox(bic(gm1), 57.74744128863877) @test isapprox(coef(gm1)[1:3], [3.044522437723423,-0.45425527227759555,-0.29298712468147375]) + + @test isapprox(residuals(gm1; type=:deviance), + [-0.6712492, 0.9627236, -0.1696466, -0.2199851, -0.9555235, + 1.049386, 0.8471537, -0.09167147, -0.9665637]; + atol=1e-6) + @test isapprox(residuals(gm1; type=:pearson), + [-0.6546537, 1.004158, -0.1684304, -0.2182179, + -0.9128709, 1.094797, 0.8728716, -0.09128709, -0.9263671]; + atol=1e-6) + @test isapprox(residuals(gm1; type=:response), + [-3, 3.666667, -0.6666667, -1, -3.333333, + 4.333333, 4, -0.3333333, -3.666667]; + atol=1e-6) + @test isapprox(residuals(gm1; type=:working), + [-0.1428571, 0.275, -0.04255319, -0.04761905, + -0.25, 0.2765957, 0.1904762, -0.025, -0.2340426]; + atol=1e-6) + # Deprecated methods @test gm1.model === gm1 @test gm1.mf.f == formula(gm1) @@ -384,6 +409,35 @@ admit = CSV.read(joinpath(glm_datadir, "admit.csv"), DataFrame) admit.rank = categorical(admit.rank) @testset "$distr with LogitLink" for distr in (Binomial, Bernoulli) + gm2 = fit(GeneralizedLinearModel, @formula(admit ~ 1 + gre + gpa + rank), admit, distr()) + res = residuals(gm2; type=:deviance) + # values from R + @test isapprox(res[1:10], + [-0.6156283, 1.568695, 0.7787919, 1.856779, -0.5019254, + 1.410201, 1.318558, -0.6994666, 1.792076, -1.207922]; atol=1e-6) + @test isapprox(res[390:400], + [-1.015303, 1.352001, 1.199244, 1.734904, 1.283653, 1.656921, + -1.158223, -0.6015442, -0.6320556, -1.116244, -0.8458358]; atol=1e-6) + res = residuals(gm2; type=:response) + # values from R + @test isapprox(res[1:10], + [-0.1726265, 0.707825, 0.2615918, 0.8216154, -0.1183539, + 0.6300301, 0.5807538, -0.2170033, 0.7992648, -0.5178682]; atol=1e-6) + @test isapprox(res[390:400], + [-0.4027505, 0.5990641, 0.512806, 0.7779709, 0.5612748, + 0.7465767, -0.48867, -0.1655043, -0.1810622, -0.4636674, -0.3007306]; atol=1e-6) + res = residuals(gm2; type=:pearson) + @test isapprox(res[1:10], + [-0.4567757, 1.556473, 0.5952011, 2.146128, -0.3663905, + 1.304961, 1.176959, -0.5264452, 1.995417, -1.036398]; atol=1e-6) + @test isapprox(res[390:399], + [-0.8211834, 1.22236, 1.025949, 1.871874, 1.131075, + 1.716382, -0.977591, -0.4453409, -0.4702063, -0.9297929]; atol=1e-6) + # less extensive test here because it's just grabbing the IRLS values, which are already + # extensively tested + @test isapprox(residuals(gm2; type=:working)[1:10], + [-1.208644, 3.422607, 1.354264, 5.605865, -1.134242, + 2.702922, 2.385234, -1.277145, 4.981688, -2.074122]; atol=1e-5) for dmethod in (:cholesky, :qr) gm2 = fit(GeneralizedLinearModel, @formula(admit ~ 1 + gre + gpa + rank), admit, distr(); method=dmethod) @@ -557,6 +611,22 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]), @test isapprox(coef(gm8), [-0.01655438172784895,0.01534311491072141]) @test isapprox(GLM.dispersion(gm8, true), 0.002446059333495581, atol=1e-6) @test isapprox(stderror(gm8), [0.00092754223, 0.000414957683], atol=1e-6) + + @test isapprox(residuals(gm8; type=:response), + [-4.859041, 4.736111, 1.992869, 0.9973619, -1.065779, + 0.02779383, -0.614323, -0.7318223, -0.4831699]; atol=1e-6) + @test isapprox(residuals(gm8; type=:working), + [0.0003219114, -0.001669384, -0.001245099, -0.0008626359, + 0.001353047, -4.456918e-05, 0.001314963, 0.001879625, 0.001414318]; + atol=1e-6) + @test isapprox(residuals(gm8; type=:deviance), + [-0.04008349, 0.08641118, 0.04900896, 0.02904992, + -0.03846595, 0.001112578, -0.02869586, -0.03755713, -0.0263724]; + atol=1e-6) + @test isapprox(residuals(gm8; type=:pearson), + [-0.03954973, 0.08891786, 0.04981284, 0.0293319, + -0.03797433, 0.001112991, -0.02842204, -0.03708843, -0.02614107]; + atol=1e-5) end @testset "InverseGaussian with $dmethod" for dmethod in (:cholesky, :qr) @@ -576,6 +646,21 @@ end @test isapprox(coef(gm8a), [-0.0011079770504295668,0.0007219138982289362]) @test isapprox(GLM.dispersion(gm8a, true), 0.0011008719709455776, atol=1e-6) @test isapprox(stderror(gm8a), [0.0001675339726910311,9.468485015919463e-5], atol=1e-6) + + @test isapprox(residuals(gm8a; type=:response), + [-18.21078, 15.52523, 7.639634, 4.20793, -0.2428536, + -0.3585344, -2.263445, -3.056904, -3.240284]; atol=1e-5) + @test isapprox(residuals(gm8a; type=:working), + [1.441199e-05, -0.0004052051, -0.0003766424, -0.0002882584, 2.402242e-05, + 4.397323e-05, 0.0003595653, 0.0005697418, 0.0006762887]; + atol=1e-6) + @test isapprox(residuals(gm8a; type=:deviance), + [-0.01230767, 0.04799467, 0.03430759, 0.02309913, -0.001715577, + -0.002827721, -0.02123177, -0.03179512, -0.0359572]; + atol=1e-6) + @test isapprox(residuals(gm8a; type=:pearson), + [-0.01145543, 0.05608432, 0.03793027, 0.02462693, -0.001707913, + -0.00280766, -0.02017246, -0.02950971, -0.03310111]; atol=1e-6) end @testset "Gamma LogLink with $dmethod" for dmethod in (:cholesky, :qr) @@ -834,6 +919,27 @@ end @test aicc(gm23) ≈ aicc(gm24) @test bic(gm23) ≈ bic(gm24) @test predict(gm23) ≈ predict(gm24) + + @test isapprox(residuals(gm23; type=:deviance)[1:10], + [-1.691255, -0.706297, -0.519149, -0.961134, -0.961134, + -0.234178, 0.17945, 0.279821, -0.803948, -0.803948]; + atol=1e-6) + @test isapprox(residuals(gm23; type=:pearson)[1:10], + [-0.902477, -0.550709, -0.433453, -0.680948, -0.680948, + -0.216258, 0.190345, 0.306518, -0.604398, -0.604398]; + atol=1e-5) + @test isapprox(residuals(gm23; type=:response)[1:10], + [-23.089885, -14.089885, -11.089885, -11.723055, -11.723055, + -3.723055, 3.276945, 5.276945, -9.919, -9.919]; + atol=1e-5) + @test isapprox(residuals(gm23; type=:working)[1:10], + [0.03668, 0.022383, 0.017617, 0.041919, 0.041919, + 0.013313, -0.011718, -0.018869, 0.039141, 0.039141]; + atol=1e-6) + + for rtype in GLM._RESIDUAL_TYPES + @test isapprox(residuals.([gm23, gm24]; type=rtype)...; atol=1e-6) + end end @testset "GLM with no intercept with Cholesky" begin @@ -1367,6 +1473,8 @@ end ft1a = ftest(mod, nullmod) @test isnan(ft1a.pval[1]) @test ft1a.pval[2] ≈ 2.481215056713184e-8 + # NB: trailing white space must be preserved in the tests for `show` + # If your editor is set to automatically strip it, this will cause problems if VERSION >= v"1.6.0" @test sprint(show, ft1a) == """ F-test: 2 models fitted on 12 observations