From 8625214b43e380c46bc7a80ca664cc9dfb5f6942 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 20 Sep 2022 16:56:48 -0500 Subject: [PATCH 01/29] add residuals method for GLM --- src/glmfit.jl | 34 ++++++++++++++++++++++++++ src/glmtools.jl | 2 +- test/runtests.jl | 63 +++++++++++++++++++++++++++++++++--------------- 3 files changed, 79 insertions(+), 20 deletions(-) diff --git a/src/glmfit.jl b/src/glmfit.jl index d2a7b028..087ead49 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -731,3 +731,37 @@ function checky(y, d::Binomial) end return nothing end + +# need to add :pearson +const _RESIDUAL_TYPES = [:deviance, :response, :working] + +""" + residuals(model::GeneralizedLinearModel; type=:response) + +Return the residuals of a GLM. + +Supported residual types are: +- `:response`: the difference between the observed and fitted values +- `:deviance`: the signed square root of the elementwise + contribution to the deviance +- `:working`: working residuals (used during the IRLS process) +""" +function residuals(model::GeneralizedLinearModel; type=:response) + type in _RESIDUAL_TYPES || + throw(ArgumentError("Unsupported type `$(type)``; supported types are" * + "$(_RESIDUAL_TYPES)")) + # TODO: add in optimized method for normal with identity linnk + res = if type == :response + response(model) - fitted(model) + elseif type == :deviance + # XXX I think this might be the same as + # 2 * wrkresid, but I'm not 100% sure if that holds across families + sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid) + elseif type == :working + return model.rr.wrkresid + else + error("An error has occurred. Please file an issue on GitHub.") + end + + return res +end diff --git a/src/glmtools.jl b/src/glmtools.jl index b6ec0008..2b066ac4 100644 --- a/src/glmtools.jl +++ b/src/glmtools.jl @@ -514,7 +514,7 @@ function loglik_obs end loglik_obs(::Bernoulli, y, μ, wt, ϕ) = wt*logpdf(Bernoulli(μ), y) loglik_obs(::Binomial, y, μ, wt, ϕ) = logpdf(Binomial(Int(wt), μ), Int(y*wt)) loglik_obs(::Gamma, y, μ, wt, ϕ) = wt*logpdf(Gamma(inv(ϕ), μ*ϕ), y) -# In Distributions.jl, a Geometric distribution characterizes the number of failures before +# In Distributions.jl, a Geometric distribution characterizes the number of failures before # the first success in a sequence of independent Bernoulli trials with success rate p. # The mean of Geometric distribution is (1 - p) / p. # Hence, p = 1 / (1 + μ). 
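Before the tests, a minimal usage sketch of the method this patch adds (not part of the diff; it assumes the patch is applied). It reuses the Dobson (1990) counts from the test suite below, simplified to the matrix interface with a hand-coded dummy matrix; the `type` keyword takes the same values as R's `residuals.glm`, which is where the tests' reference values come from:

```julia
using GLM

# Dobson (1990) Poisson counts, also used in the tests below; here we fit only
# an intercept and outcome dummies, purely for illustration.
X = [ones(9) repeat([0, 1, 0], outer=3) repeat([0, 0, 1], outer=3)]
y = [18.0, 17.0, 15.0, 20.0, 10.0, 20.0, 25.0, 13.0, 12.0]
m = glm(X, y, Poisson())

residuals(m)                  # :response is the default in this patch: y .- fitted(m)
residuals(m; type=:deviance)  # signed square roots of the per-observation deviance contributions
residuals(m; type=:working)   # IRLS working residuals, i.e. m.rr.wrkresid
```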
diff --git a/test/runtests.jl b/test/runtests.jl index b54d9127..88c64ebe 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,14 +51,14 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y end @testset "Linear Model Cook's Distance" begin - st_df = DataFrame( + st_df = DataFrame( Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], - XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], + XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], XC=[3., 13., 23., 39.8, 34., 31.], # values from SAS proc reg - CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], - CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], + CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], + CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], ) @@ -66,7 +66,7 @@ end t_lm_base = lm(@formula(Y ~ XA), st_df) @test isapprox(st_df.CooksD_base, cooksdistance(t_lm_base)) - # linear regression, no intercept + # linear regression, no intercept t_lm_noint = lm(@formula(Y ~ XA +0), st_df) @test isapprox(st_df.CooksD_noint, cooksdistance(t_lm_noint)) @@ -82,7 +82,7 @@ end end -@testset "linear model with weights" begin +@testset "linear model with weights" begin df = dataset("quantreg", "engel") N = nrow(df) df.weights = repeat(1:5, Int(N/5)) @@ -94,13 +94,17 @@ end @test isapprox(stderror(lm_model), [9.382302620120193, 0.00816741377772968]) @test isapprox(r2(lm_model), 0.8330258148644486) @test isapprox(adjr2(lm_model), 0.832788298242634) - @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; + @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; -0.06772589439264813 6.670664781664879e-5]) @test isapprox(first(predict(lm_model)), 357.57694841780994) @test isapprox(loglikelihood(lm_model), -4353.946729075838) @test isapprox(loglikelihood(glm_model), -4353.946729075838) @test isapprox(nullloglikelihood(lm_model), -4984.892139711452) - @test isapprox(mean(residuals(lm_model)), -5.412966629787718) + @test isapprox(mean(residuals(lm_model)), -5.412966629787718) + + # this should be elementwise true (which is a stronger condition than + # vectors being approximately equal) the so we test elementwise + @test all(residuals(glm_model) .≈ residuals(lm_model)) end @testset "rankdeficient" begin @@ -315,6 +319,27 @@ admit.rank = categorical(admit.rank) @test isapprox(coef(gm2), [-3.9899786606380756, 0.0022644256521549004, 0.804037453515578, -0.6754428594116578, -1.340203811748108, -1.5514636444657495]) + res = residuals(gm2; type=:deviance) + # values from R + @test isapprox(res[1:10], + [-0.6156283, 1.568695, 0.7787919, 1.856779, -0.5019254, + 1.410201, 1.318558, -0.6994666, 1.792076, -1.207922]; atol=1e-6) + @test isapprox(res[390:400], + [-1.015303, 1.352001, 1.199244, 1.734904, 1.283653, 1.656921, + -1.158223, -0.6015442, -0.6320556, -1.116244, -0.8458358]; atol=1e-6) + res = residuals(gm2; type=:response) + # values from R + @test isapprox(res[1:10], + [-0.1726265, 0.707825, 0.2615918, 0.8216154, -0.1183539, + 0.6300301, 0.5807538, -0.2170033, 0.7992648, -0.5178682], atol=1e-6) + @test isapprox(res[390:400], + [-0.4027505, 0.5990641, 0.512806, 0.7779709, 0.5612748, + 0.7465767, -0.48867, -0.1655043, -0.1810622, -0.4636674, -0.3007306]; atol=1e-6) + # less extensive test here because it's just 
grabbing the IRLS values, which are already + # extensively tested + @test isapprox(residuals(gm2; type=:working)[1:10], + [-1.208644, 3.422607, 1.354264, 5.605865, -1.134242, + 2.702922, 2.385234, -1.277145, 4.981688, -2.074122]; atol=1e-5) end @testset "Bernoulli ProbitLink" begin @@ -707,8 +732,8 @@ end @test aic(gm22) ≈ 1112.7422553284146 @test aicc(gm22) ≈ 1113.7933502189255 @test bic(gm22) ≈ 1136.6111083020812 - @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, - -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, + @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, + -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, 0.29016080736927813] @test stderror(gm22) ≈ [0.22754287093719366, 0.15274755092180423, 0.15928431669166637, 0.23853372776980591, 0.2354231414867577, 0.24750780320597515, @@ -823,7 +848,7 @@ end gm11 = fit(GeneralizedLinearModel, X, Y, Binomial()) @test isapprox(predict(gm11), Y) @test predict(gm11) == fitted(gm11) - + newX = rand(rng, 5, 2) newY = logistic.(newX * coef(gm11)) gm11_pred1 = predict(gm11, newX) @@ -1055,7 +1080,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 3 0.1283 0.9603 + [1] 3 0.1283 0.9603 [2] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-07 ──────────────────────────────────────────────────────────────────""" else @@ -1064,7 +1089,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 3 0.1283 0.9603 + [1] 3 0.1283 0.9603 [2] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7 ─────────────────────────────────────────────────────────────────""" end @@ -1078,7 +1103,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 ──────────────────────────────────────────────────────────────────""" else @@ -1087,7 +1112,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 ─────────────────────────────────────────────────────────────────""" end @@ -1103,7 +1128,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 [3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 ──────────────────────────────────────────────────────────────────""" @@ -1113,7 +1138,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 [3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 ──────────────────────────────────────────────────────────────────""" @@ -1129,7 +1154,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) 
────────────────────────────────────────────────────────────────── - [1] 5 0.1017 0.9685 + [1] 5 0.1017 0.9685 [2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950 [3] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-07 ──────────────────────────────────────────────────────────────────""" @@ -1139,7 +1164,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 5 0.1017 0.9685 + [1] 5 0.1017 0.9685 [2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950 [3] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7 ──────────────────────────────────────────────────────────────────""" From 30a77ab389d52be4ddff29e1c85883cace7b6c3d Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 20 Sep 2022 17:02:59 -0500 Subject: [PATCH 02/29] set default to deviance residuals --- src/glmfit.jl | 4 ++-- test/runtests.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/glmfit.jl b/src/glmfit.jl index 087ead49..b84f2e03 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -736,7 +736,7 @@ end const _RESIDUAL_TYPES = [:deviance, :response, :working] """ - residuals(model::GeneralizedLinearModel; type=:response) + residuals(model::GeneralizedLinearModel; type=:deviance) Return the residuals of a GLM. @@ -746,7 +746,7 @@ Supported residual types are: contribution to the deviance - `:working`: working residuals (used during the IRLS process) """ -function residuals(model::GeneralizedLinearModel; type=:response) +function residuals(model::GeneralizedLinearModel; type=:deviance) type in _RESIDUAL_TYPES || throw(ArgumentError("Unsupported type `$(type)``; supported types are" * "$(_RESIDUAL_TYPES)")) diff --git a/test/runtests.jl b/test/runtests.jl index 88c64ebe..c12f753c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -104,7 +104,7 @@ end # this should be elementwise true (which is a stronger condition than # vectors being approximately equal) the so we test elementwise - @test all(residuals(glm_model) .≈ residuals(lm_model)) + @test all(residuals(glm_model; type=:response) .≈ residuals(lm_model)) end @testset "rankdeficient" begin From 92660bee7d189f5ca30b58155e11b6b978c0b08f Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 21 Sep 2022 09:23:36 -0500 Subject: [PATCH 03/29] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/glmfit.jl | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/glmfit.jl b/src/glmfit.jl index b84f2e03..6ff58adb 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -740,28 +740,26 @@ const _RESIDUAL_TYPES = [:deviance, :response, :working] Return the residuals of a GLM. 
-Supported residual types are: +Supported values for `type` are: +- `:deviance` (the default): the signed square root of the element-wise + contribution to the deviance - `:response`: the difference between the observed and fitted values -- `:deviance`: the signed square root of the elementwise - contribution to the deviance - `:working`: working residuals (used during the IRLS process) """ function residuals(model::GeneralizedLinearModel; type=:deviance) type in _RESIDUAL_TYPES || throw(ArgumentError("Unsupported type `$(type)``; supported types are" * "$(_RESIDUAL_TYPES)")) - # TODO: add in optimized method for normal with identity linnk - res = if type == :response - response(model) - fitted(model) + # TODO: add in optimized method for normal with identity link + if type == :response + return response(model) - fitted(model) elseif type == :deviance # XXX I think this might be the same as # 2 * wrkresid, but I'm not 100% sure if that holds across families - sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid) + return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid) elseif type == :working return model.rr.wrkresid else error("An error has occurred. Please file an issue on GitHub.") end - - return res end From ca6342b2ca0ed50ca29d2308700a1dfa6c983380 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 21 Sep 2022 11:28:55 -0500 Subject: [PATCH 04/29] restore whitespace in f-tests --- test/runtests.jl | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index c12f753c..d000e5f5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,6 @@ +# NB: trailing white space must be preserved in the tests for `show` +# If your editor is set to automatically strip it, this will cause problems + using CategoricalArrays, CSV, DataFrames, LinearAlgebra, SparseArrays, StableRNGs, Statistics, StatsBase, Test, RDatasets using GLM @@ -51,14 +54,14 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y end @testset "Linear Model Cook's Distance" begin - st_df = DataFrame( + st_df = DataFrame( Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], - XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], + XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], XC=[3., 13., 23., 39.8, 34., 31.], # values from SAS proc reg - CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], - CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], + CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], + CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], ) @@ -66,7 +69,7 @@ end t_lm_base = lm(@formula(Y ~ XA), st_df) @test isapprox(st_df.CooksD_base, cooksdistance(t_lm_base)) - # linear regression, no intercept + # linear regression, no intercept t_lm_noint = lm(@formula(Y ~ XA +0), st_df) @test isapprox(st_df.CooksD_noint, cooksdistance(t_lm_noint)) @@ -82,7 +85,7 @@ end end -@testset "linear model with weights" begin +@testset "linear model with weights" begin df = dataset("quantreg", "engel") N = nrow(df) df.weights = repeat(1:5, Int(N/5)) @@ -94,7 +97,7 @@ end @test isapprox(stderror(lm_model), [9.382302620120193, 0.00816741377772968]) @test isapprox(r2(lm_model), 0.8330258148644486) @test 
isapprox(adjr2(lm_model), 0.832788298242634) - @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; + @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; -0.06772589439264813 6.670664781664879e-5]) @test isapprox(first(predict(lm_model)), 357.57694841780994) @test isapprox(loglikelihood(lm_model), -4353.946729075838) @@ -732,8 +735,8 @@ end @test aic(gm22) ≈ 1112.7422553284146 @test aicc(gm22) ≈ 1113.7933502189255 @test bic(gm22) ≈ 1136.6111083020812 - @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, - -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, + @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, + -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, 0.29016080736927813] @test stderror(gm22) ≈ [0.22754287093719366, 0.15274755092180423, 0.15928431669166637, 0.23853372776980591, 0.2354231414867577, 0.24750780320597515, @@ -848,7 +851,7 @@ end gm11 = fit(GeneralizedLinearModel, X, Y, Binomial()) @test isapprox(predict(gm11), Y) @test predict(gm11) == fitted(gm11) - + newX = rand(rng, 5, 2) newY = logistic.(newX * coef(gm11)) gm11_pred1 = predict(gm11, newX) @@ -1074,13 +1077,15 @@ end ft1a = ftest(mod, nullmod) @test isnan(ft1a.pval[1]) @test ft1a.pval[2] ≈ 2.481215056713184e-8 + # NB: trailing white space must be preserved in the tests for `show` + # If your editor is set to automatically strip it, this will cause problems if VERSION >= v"1.6.0" @test sprint(show, ft1a) == """ F-test: 2 models fitted on 12 observations ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 3 0.1283 0.9603 + [1] 3 0.1283 0.9603 [2] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-07 ──────────────────────────────────────────────────────────────────""" else @@ -1089,7 +1094,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 3 0.1283 0.9603 + [1] 3 0.1283 0.9603 [2] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7 ─────────────────────────────────────────────────────────────────""" end @@ -1103,7 +1108,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 ──────────────────────────────────────────────────────────────────""" else @@ -1112,7 +1117,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 ─────────────────────────────────────────────────────────────────""" end @@ -1128,7 +1133,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 [3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 ──────────────────────────────────────────────────────────────────""" @@ -1138,7 +1143,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* 
p(>F) ────────────────────────────────────────────────────────────────── - [1] 2 3.2292 -0.0000 + [1] 2 3.2292 -0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 [3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 ──────────────────────────────────────────────────────────────────""" @@ -1154,7 +1159,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 5 0.1017 0.9685 + [1] 5 0.1017 0.9685 [2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950 [3] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-07 ──────────────────────────────────────────────────────────────────""" @@ -1164,7 +1169,7 @@ end ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ────────────────────────────────────────────────────────────────── - [1] 5 0.1017 0.9685 + [1] 5 0.1017 0.9685 [2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950 [3] 2 -1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7 ──────────────────────────────────────────────────────────────────""" From 66325eb6860dbec05470b713ac90dca76eea4eab Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 21 Sep 2022 12:11:58 -0500 Subject: [PATCH 05/29] add in tests for other families --- test/runtests.jl | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index d000e5f5..8fdcdc61 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -301,6 +301,24 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], @test isapprox(bic(gm1), 57.74744128863877) @test isapprox(coef(gm1)[1:3], [3.044522437723423,-0.45425527227759555,-0.29298712468147375]) + + @test isapprox(residuals(gm1; type=:deviance), + [-0.6712492, 0.9627236, -0.1696466, -0.2199851, -0.9555235, + 1.049386, 0.8471537, -0.09167147, -0.9665637]; + atol=1e-6) + # @test isapprox(residuals(gm1; type=:pearson), + # [-0.6546537, 1.004158, -0.1684304, -0.2182179, + # -0.9128709, 1.094797, 0.8728716, -0.09128709, -0.9263671]; + # atol=1e-6) + @test isapprox(residuals(gm1; type=:response), + [-3, 3.666667, -0.6666667, -1, -3.333333, + 4.333333, 4, -0.3333333, -3.666667]; + atol=1e-6) + @test isapprox(residuals(gm1; type=:working), + [-0.1428571, 0.275, -0.04255319, -0.04761905, + -0.25, 0.2765957, 0.1904762, -0.025, -0.2340426]; + atol=1e-6) + end ## Example from http://www.ats.ucla.edu/stat/r/dae/logit.htm @@ -338,6 +356,13 @@ admit.rank = categorical(admit.rank) @test isapprox(res[390:400], [-0.4027505, 0.5990641, 0.512806, 0.7779709, 0.5612748, 0.7465767, -0.48867, -0.1655043, -0.1810622, -0.4636674, -0.3007306]; atol=1e-6) + # res = residuals(gm2; type=:pearson) + # @test isapprox(res[1:10], + # [-0.4567757, 1.556473, 0.5952011, 2.146128, -0.3663905, + # 1.304961, 1.176959, -0.5264452, 1.995417, -1.036398]; atol=1e-6) + # @test isapprox(res[390:400], + # [-0.8211834, 1.22236, 1.025949, 1.871874, 1.131075, + # 1.716382, -0.977591, -0.4453409, -0.4702063, -0.9297929]; atol=1e-6) # less extensive test here because it's just grabbing the IRLS values, which are already # extensively tested @test isapprox(residuals(gm2; type=:working)[1:10], @@ -500,6 +525,22 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]), @test isapprox(coef(gm8), [-0.01655438172784895,0.01534311491072141]) @test isapprox(GLM.dispersion(gm8.model, true), 0.002446059333495581, atol=1e-6) @test isapprox(stderror(gm8), [0.00092754223, 0.000414957683], atol=1e-6) + + @test 
isapprox(residuals(gm8; type=:response), + [-4.859041, 4.736111, 1.992869, 0.9973619, -1.065779, + 0.02779383, -0.614323, -0.7318223, -0.4831699]; atol=1e-6) + @test isapprox(residuals(gm8; type=:working), + [0.0003219114, -0.001669384, -0.001245099, -0.0008626359, + 0.001353047, -4.456918e-05, 0.001314963, 0.001879625, 0.001414318]; + atol=1e-6) + @test isapprox(residuals(gm8; type=:deviance), + [-0.04008349, 0.08641118, 0.04900896, 0.02904992, + -0.03846595, 0.001112578, -0.02869586, -0.03755713, -0.0263724]; + atol=1e-6) + # @test isapprox(residuals(gm8; type=:pearson), + # [-0.03954973, 0.08891786, 0.04981284, 0.0293319, + # -0.03797433, 0.001112991, -0.02842204, -0.03708843, -0.02614107]; + # atol=1e-6) end @testset "InverseGaussian" begin @@ -518,6 +559,21 @@ end @test isapprox(coef(gm8a), [-0.0011079770504295668,0.0007219138982289362]) @test isapprox(GLM.dispersion(gm8a.model, true), 0.0011008719709455776, atol=1e-6) @test isapprox(stderror(gm8a), [0.0001675339726910311,9.468485015919463e-5], atol=1e-6) + + @test isapprox(residuals(gm8a; type=:response), + [-18.21078, 15.52523, 7.639634, 4.20793, -0.2428536, + -0.3585344, -2.263445, -3.056904, -3.240284]; atol=1e-5) + @test isapprox(residuals(gm8a; type=:working), + [1.441199e-05, -0.0004052051, -0.0003766424, -0.0002882584, 2.402242e-05, + 4.397323e-05, 0.0003595653, 0.0005697418, 0.0006762887]; + atol=1e-6) + @test isapprox(residuals(gm8a; type=:deviance), + [-0.01230767, 0.04799467, 0.03430759, 0.02309913, -0.001715577, + -0.002827721, -0.02123177, -0.03179512, -0.0359572]; + atol=1e-6) + # @test isapprox(residuals(gm8a; type=:pearson), + # [-0.01145543, 0.05608432, 0.03793027, 0.02462693, -0.001707913, + # -0.00280766, -0.02017246, -0.02950971, -0.03310111]; atol=1e-6) end @testset "Gamma LogLink" begin From ffd7c979d3450c103a4384acaa1a7b387eedf6fe Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 28 Sep 2022 22:38:02 -0500 Subject: [PATCH 06/29] update doctests --- docs/src/api.md | 4 ++-- docs/src/examples.md | 6 +++--- docs/src/index.md | 13 +++++++++++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 46990470..20723cf8 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -25,7 +25,7 @@ The most general approach to fitting a model is with the `fit` function, as in julia> using Random julia> fit(LinearModel, hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ──────────────────────────────────────────────────────────────── @@ -41,7 +41,7 @@ This model can also be fit as julia> using Random julia> lm(hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ──────────────────────────────────────────────────────────────── diff --git a/docs/src/examples.md b/docs/src/examples.md index 54de17d6..cfa56907 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -20,7 +20,7 @@ julia> data = DataFrame(X=[1,2,3], Y=[2,4,7]) 3 │ 3 7 julia> ols = lm(@formula(Y ~ X), data) 
-StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} +StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} Y ~ 1 + X @@ -207,7 +207,7 @@ julia> form = dataset("datasets", "Formaldehyde") 6 │ 0.9 0.782 julia> lm1 = fit(LinearModel, @formula(OptDen ~ Carb), form) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} +StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} OptDen ~ 1 + Carb @@ -256,7 +256,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings") 35 rows omitted julia> fm2 = fit(LinearModel, @formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} +StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} SR ~ 1 + Pop15 + Pop75 + DPI + DDPI diff --git a/docs/src/index.md b/docs/src/index.md index fb62a0dc..426aa1a5 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -79,13 +79,16 @@ Using a `CategoricalVector` constructed with `categorical` or `categorical!`: ```jldoctest categorical julia> using CategoricalArrays, DataFrames, GLM, StableRNGs + julia> rng = StableRNG(1); # Ensure example can be reproduced + julia> data = DataFrame(y = rand(rng, 100), x = categorical(repeat([1, 2, 3, 4], 25))); + julia> lm(@formula(y ~ x), data) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} +StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} y ~ 1 + x @@ -105,10 +108,12 @@ Using [`contrasts`](https://juliastats.github.io/StatsModels.jl/stable/contrasts ```jldoctest categorical julia> using StableRNGs + julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25)); + julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding())) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} +StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} y ~ 1 + x @@ -130,12 +135,16 @@ which computes an F-test between each pair of subsequent models and reports fit ```jldoctest julia> using DataFrames, GLM, StableRNGs + julia> data = DataFrame(y = (1:50).^2 .+ randn(StableRNG(1), 50), x = 1:50); + julia> ols_lin = lm(@formula(y ~ x), data); + julia> ols_sq = lm(@formula(y ~ x + x^2), data); + julia> ftest(ols_lin.model, ols_sq.model) F-test: 2 models fitted on 50 observations 
───────────────────────────────────────────────────────────────────────────────── From 9480b0c341bf79196678ab3e2f525ed0a625e1a6 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 3 Dec 2022 11:54:18 -0600 Subject: [PATCH 07/29] Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid --- Project.toml | 2 + docs/src/api.md | 4 +- docs/src/examples.md | 18 +- docs/src/index.md | 4 +- src/GLM.jl | 29 ++- src/deprecated.jl | 17 ++ src/ftest.jl | 5 +- src/glmfit.jl | 143 ++++++++++---- src/linpred.jl | 59 +++++- src/lm.jl | 116 +++++++---- src/negbinfit.jl | 8 +- test/runtests.jl | 448 ++++++++++++++++++++++++++++++++----------- 12 files changed, 649 insertions(+), 204 deletions(-) diff --git a/Project.toml b/Project.toml index 83b8d844..7b9bda7c 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] CSV = "0.7, 0.8" @@ -27,6 +28,7 @@ StatsAPI = "1.4" StatsBase = "0.33.5" StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0" StatsModels = "0.6.23" +Tables = "1" julia = "1" [extras] diff --git a/docs/src/api.md b/docs/src/api.md index 20723cf8..70d0356b 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -25,7 +25,7 @@ The most general approach to fitting a model is with the `fit` function, as in julia> using Random julia> fit(LinearModel, hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: +LinearModel Coefficients: ──────────────────────────────────────────────────────────────── @@ -41,7 +41,7 @@ This model can also be fit as julia> using Random julia> lm(hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: +LinearModel Coefficients: ──────────────────────────────────────────────────────────────── diff --git a/docs/src/examples.md b/docs/src/examples.md index 338d01c0..2eeb9f79 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -20,7 +20,7 @@ julia> data = DataFrame(X=[1,2,3], Y=[2,4,7]) 3 │ 3 7 julia> ols = lm(@formula(Y ~ X), data) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +LinearModel Y ~ 1 + X @@ -99,7 +99,7 @@ julia> data = DataFrame(X=[1,2,2], Y=[1,0,1]) 3 │ 2 1 julia> probit = glm(@formula(Y ~ X), data, Binomial(), ProbitLink()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +GeneralizedLinearModel Y ~ 1 + X @@ -140,7 +140,7 @@ julia> quine = dataset("MASS", "quine") 131 rows omitted julia> nbrmodel = glm(@formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +GeneralizedLinearModel Days ~ 1 + Eth + Sex + Age + Lrn @@ -158,7 
+158,7 @@ Lrn: SL 0.296768 0.185934 1.60 0.1105 -0.0676559 0.661191 ──────────────────────────────────────────────────────────────────────────── julia> nbrmodel = negbin(@formula(Days ~ Eth+Sex+Age+Lrn), quine, LogLink()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +GeneralizedLinearModel Days ~ 1 + Eth + Sex + Age + Lrn @@ -175,7 +175,7 @@ Age: F3 0.356955 0.247228 1.44 0.1488 -0.127602 0.841513 Lrn: SL 0.292138 0.18565 1.57 0.1156 -0.0717297 0.656006 ──────────────────────────────────────────────────────────────────────────── -julia> println("Estimated theta = ", round(nbrmodel.model.rr.d.r, digits=5)) +julia> println("Estimated theta = ", round(nbrmodel.rr.d.r, digits=5)) Estimated theta = 1.27489 ``` @@ -207,7 +207,7 @@ julia> form = dataset("datasets", "Formaldehyde") 6 │ 0.9 0.782 julia> lm1 = fit(LinearModel, @formula(OptDen ~ Carb), form) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +LinearModel OptDen ~ 1 + Carb @@ -256,7 +256,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings") 35 rows omitted julia> fm2 = fit(LinearModel, @formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +LinearModel SR ~ 1 + Pop15 + Pop75 + DPI + DDPI @@ -364,7 +364,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,21,25,13,13], 9 │ 13.0 3 3 julia> gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ Outcome + Treatment), dobson, Poisson()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +GeneralizedLinearModel Counts ~ 1 + Outcome + Treatment @@ -421,7 +421,7 @@ julia> round(optimal_bic.minimizer, digits = 5) # Optimal λ 0.40935 julia> glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(optimal_bic.minimizer)) # Best model -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +GeneralizedLinearModel Volume ~ 1 + Height + Girth diff --git a/docs/src/index.md b/docs/src/index.md index 426aa1a5..12ca1f83 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -88,7 +88,7 @@ julia> data = DataFrame(y = rand(rng, 100), x = categorical(repeat([1, 2, 3, 4], julia> lm(@formula(y ~ x), data) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +LinearModel y ~ 1 + x @@ -113,7 +113,7 @@ julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25 julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding())) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}} +LinearModel y ~ 1 + x 
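The documentation diffs above all reflect this merge's central change: formula fits now return `LinearModel` or `GeneralizedLinearModel` directly instead of a `StatsModels.TableRegressionModel` wrapper. A hedged before/after sketch of what that means for user code (illustrative only; the compatibility shims it leans on are in the `src/deprecated.jl` hunk just below):

```julia
using GLM, RDatasets

LifeCycleSavings = dataset("datasets", "LifeCycleSavings")
m = lm(@formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings)

m isa LinearModel   # true: no wrapper around the fitted model anymore
formula(m)          # replaces m.mf.f for recovering the formula
m.rr.mu             # replaces m.model.rr.mu; `m.model` itself still works, with a depwarn
coeftable(m)        # unchanged, but coefficient names now come from the stored formula
```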
diff --git a/src/GLM.jl b/src/GLM.jl index 4b83d5dc..0e64f56a 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -10,12 +10,14 @@ module GLM import Base: (\), convert, show, size import LinearAlgebra: cholesky, cholesky! import Statistics: cor - import StatsBase: coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, - loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, + import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual, + loglikelihood, nullloglikelihood, nobs, stderror, vcov, + residuals, predict, predict!, fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue import StatsFuns: xlogy import SpecialFunctions: erfc, erfcinv, digamma, trigamma import StatsModels: hasintercept + import Tables export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr², @@ -89,6 +91,29 @@ module GLM pivoted_cholesky!(A; kwargs...) = cholesky!(A, RowMaximum(); kwargs...) end + const COMMON_FIT_KWARGS_DOCS = """ + - `dropcollinear::Bool=true`: Controls whether or not a model matrix + less-than-full rank is accepted. + If `true` (the default) the coefficient for redundant linearly dependent columns is + `0.0` and all associated statistics are set to `NaN`. + Typically from a set of linearly-dependent columns the last ones are identified as redundant + (however, the exact selection of columns identified as redundant is not guaranteed). + - `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations. + Such weights are equivalent to repeating each observation a number of times equal + to its weight. Do note that this interpretation gives equal point estimates but + different standard errors from analytical (a.k.a. inverse variance) weights and + from probability (a.k.a. sampling) weights which are the default in some other + software. + Can be length 0 to indicate no weighting (default). + - `contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()`: a `Dict` mapping term names + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts + (e.g., `HelmertCoding()`, `SeqDiffCoding(; levels=["a", "b", "c"])`, + etc.). If contrasts are not provided for a variable, the appropriate + term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data + is assumed to be categorical (with `DummyCoding()` as the default contrast type). + """ + include("linpred.jl") include("lm.jl") include("glmtools.jl") diff --git a/src/deprecated.jl b/src/deprecated.jl index f3f60243..e496a139 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -2,3 +2,20 @@ @deprecate confint(obj::LinearModel, level::Real) confint(obj, level=level) @deprecate confint(obj::AbstractGLM, level::Real) confint(obj, level=level) + +function Base.getproperty(mm::LinPredModel, f::Symbol) + if f === :model + Base.depwarn("accessing the `model` field of GLM.jl models is deprecated, " * + "as they are no longer wrapped in a `TableRegressionModel` " * + "and can be used directly now", :getproperty) + return mm + elseif f === :mf + Base.depwarn("accessing the `mf` field of GLM.jl models is deprecated, " * + "as they are no longer wrapped in a `TableRegressionModel`." 
* + "Use `formula(m)` to access the model formula.", :getproperty) + form = formula(mm) + return ModelFrame{Nothing, typeof(mm)}(form, nothing, nothing, typeof(mm)) + else + return getfield(mm, f) + end +end diff --git a/src/ftest.jl b/src/ftest.jl index 00a19437..b29f6644 100644 --- a/src/ftest.jl +++ b/src/ftest.jl @@ -108,8 +108,7 @@ julia> model = lm(@formula(Result ~ 1 + Treatment), dat); julia> bigmodel = lm(@formula(Result ~ 1 + Treatment + Other), dat); - -julia> ftest(nullmodel.model, model.model) +julia> ftest(nullmodel, model) F-test: 2 models fitted on 12 observations ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) @@ -118,7 +117,7 @@ F-test: 2 models fitted on 12 observations [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 ───────────────────────────────────────────────────────────────── -julia> ftest(nullmodel.model, model.model, bigmodel.model) +julia> ftest(nullmodel, model, bigmodel) F-test: 3 models fitted on 12 observations ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) diff --git a/src/glmfit.jl b/src/glmfit.jl index bd1d5548..f102c21c 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -229,6 +229,7 @@ abstract type AbstractGLM <: LinPredModel end mutable struct GeneralizedLinearModel{G<:GlmResp,L<:LinPred} <: AbstractGLM rr::G pp::L + formula::Union{FormulaTerm,Nothing} fit::Bool maxiter::Int minstepfac::Float64 @@ -236,8 +237,9 @@ mutable struct GeneralizedLinearModel{G<:GlmResp,L<:LinPred} <: AbstractGLM rtol::Float64 end -GeneralizedLinearModel(rr::GlmResp, pp::LinPred, fit::Bool) = - GeneralizedLinearModel(rr, pp, fit, 0, NaN, NaN, NaN) +GeneralizedLinearModel(rr::GlmResp, pp::LinPred, + f::Union{FormulaTerm, Nothing}, fit::Bool) = + GeneralizedLinearModel(rr, pp, f, fit, 0, NaN, NaN, NaN) function coeftable(mm::AbstractGLM; level::Real=0.95) cc = coef(mm) @@ -246,9 +248,10 @@ function coeftable(mm::AbstractGLM; level::Real=0.95) p = 2 * ccdf.(Ref(Normal()), abs.(zz)) ci = se*quantile(Normal(), (1-level)/2) levstr = isinteger(level*100) ? string(Integer(level*100)) : string(level*100) + cn = coefnames(mm) CoefTable(hcat(cc,se,zz,p,cc+ci,cc-ci), ["Coef.","Std. Error","z","Pr(>|z|)","Lower $levstr%","Upper $levstr%"], - ["x$i" for i = 1:size(mm.pp.X, 2)], 4, 3) + cn, 4, 3) end function confint(obj::AbstractGLM; level::Real=0.95) @@ -466,7 +469,6 @@ function StatsBase.fit!(m::AbstractGLM, y; wts=nothing, offset=nothing, - dofit::Bool=true, verbose::Bool=false, maxiter::Integer=30, minstepfac::Real=0.001, @@ -526,20 +528,8 @@ const FIT_GLM_DOC = """ for a list of built-in links). # Keyword Arguments - - `dropcollinear::Bool=true`: Controls whether or not `lm` accepts a model matrix which - is less-than-full rank. - If `true` (the default) the coefficient for redundant linearly dependent columns is - `0.0` and all associated statistics are set to `NaN`. - Typically from a set of linearly-dependent columns the last ones are identified as redundant - (however, the exact selection of columns identified as redundant is not guaranteed). - - `dofit::Bool=true`: Determines whether model will be fit - - `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations. - Such weights are equivalent to repeating each observation a number of times equal - to its weight. Do note that this interpretation gives equal point estimates but - different standard errors from analytical (a.k.a. inverse variance) weights and - from probability (a.k.a. 
sampling) weights which are the default in some other - software. - Can be length 0 to indicate no weighting (default). + - `dofit::Bool=true`: Determines whether model will be fit. Only supported with `glm`. + $COMMON_FIT_KWARGS_DOCS - `offset::Vector=similar(y,0)`: offset added to `Xβ` to form `eta`. Can be of length 0 - `verbose::Bool=false`: Display convergence information for each iteration @@ -569,10 +559,15 @@ function fit(::Type{M}, d::UnivariateDistribution, l::Link = canonicallink(d); dropcollinear::Bool = true, - dofit::Bool = true, + dofit::Union{Bool, Nothing} = nothing, wts::AbstractVector{<:Real} = similar(y, 0), offset::AbstractVector{<:Real} = similar(y, 0), fitargs...) where {M<:AbstractGLM} + if dofit === nothing + dofit = true + else + Base.depwarn("`dofit` argument to `fit` is deprecated", :fit) + end # Check that X and y have the same number of observations if size(X, 1) != size(y, 1) @@ -580,7 +575,7 @@ function fit(::Type{M}, end rr = GlmResp(y, d, l, offset, wts) - res = M(rr, cholpred(X, dropcollinear), false) + res = M(rr, cholpred(X, dropcollinear), nothing, false) return dofit ? fit!(res; fitargs...) : res end @@ -591,6 +586,37 @@ fit(::Type{M}, l::Link=canonicallink(d); kwargs...) where {M<:AbstractGLM} = fit(M, float(X), float(y), d, l; kwargs...) +function fit(::Type{M}, + f::FormulaTerm, + data, + d::UnivariateDistribution, + l::Link=canonicallink(d); + offset::Union{AbstractVector, Nothing} = nothing, + wts::Union{AbstractVector, Nothing} = nothing, + dropcollinear::Bool = true, + dofit::Union{Bool, Nothing} = nothing, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}(), + fitargs...) where {M<:AbstractGLM} + if dofit === nothing + dofit = true + else + Base.depwarn("`dofit` argument to `fit` is deprecated", :fit) + end + + f, (y, X) = modelframe(f, data, contrasts, M) + + # Check that X and y have the same number of observations + if size(X, 1) != size(y, 1) + throw(DimensionMismatch("number of rows in X and y must match")) + end + + off = offset === nothing ? similar(y, 0) : offset + wts = wts === nothing ? similar(y, 0) : wts + rr = GlmResp(y, d, l, off, wts) + res = M(rr, cholpred(X, dropcollinear), f, false) + return dofit ? fit!(res; fitargs...) : res +end + """ glm(formula, data, distr::UnivariateDistribution, link::Link = canonicallink(distr); ) @@ -631,13 +657,12 @@ function dispersion(m::AbstractGLM, sqr::Bool=false) end end +const PREDICT_COMMON = """ - predict(mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[], - interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95, - interval_method::Symbol = :transformation) - -Return the predicted response of model `mm` from covariate values `newX` and, -optionally, an `offset`. +`newX` must be either a table (in the [Tables.jl]((https://tables.juliadata.org/stable/) +definition) containing all columns used in the model formula, or a matrix with one column +for each predictor in the model. In both cases, each row represents an observation for +which a prediction will be returned. If `interval=:confidence`, also return upper and lower bounds for a given coverage `level`. By default (`interval_method = :transformation`) the intervals are constructed by applying @@ -647,11 +672,54 @@ response around the linear predictor. The `:delta` method intervals are symmetri the point estimates, but do not respect natural parameter constraints (e.g., the lower bound for a probability could be negative). 
""" + +""" + predict(mm::AbstractGLM, newX; + offset::FPVector=[], + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, + interval_method::Symbol=:transformation) + +Return the predicted response of model `mm` from covariate values `newX` and, +optionally, an `offset`. + +$PREDICT_COMMON +""" function predict(mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[], interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, interval_method=:transformation) + r = response(mm) + len = size(newX, 1) + res = interval === nothing ? + similar(r, len) : + (prediction=similar(r, len), lower=similar(r, len), upper=similar(r, len)) + predict!(res, mm, newX, + offset=offset, interval=interval, level=level, + interval_method=interval_method) +end + +""" + predict!(res, mm::AbstractGLM, newX::AbstractMatrix; + offset::FPVector=eltype(newX)[], + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, + interval_method::Symbol=:transformation) + +Store in `res` the predicted response of model `mm` from covariate values `newX` +and, optionally, an `offset`. `res` must be a vector with a length equal to the number +of rows in `newX` if `interval=nothing` (the default), and otherwise a `NamedTuple` +of vectors with names `prediction`, `lower` and `upper`. + +$PREDICT_COMMON +""" +function predict!(res::Union{AbstractVector, + NamedTuple{(:prediction, :lower, :upper), + <: NTuple{3, AbstractVector}}}, + mm::AbstractGLM, newX::AbstractMatrix; + offset::FPVector=eltype(newX)[], + interval::Union{Symbol,Nothing}=nothing, + level::Real=0.95, + interval_method=:transformation) eta = newX * coef(mm) if !isempty(mm.rr.offset) length(offset) == size(newX, 1) || @@ -660,11 +728,20 @@ function predict(mm::AbstractGLM, newX::AbstractMatrix; else length(offset) > 0 && throw(ArgumentError("fit without offset, so value of `offset` kw arg does not make sense")) end - mu = linkinv.(Link(mm), eta) if interval === nothing - return mu + res isa AbstractVector || + throw(ArgumentError("`res` must be a vector when `interval == nothing` or is omitted")) + length(res) == size(newX, 1) || + throw(DimensionMismatch("length of `res` must equal the number of rows in `newX`")) + res .= linkinv.(Link(mm), eta) elseif interval == :confidence + res isa NamedTuple || + throw(ArgumentError("`res` must be a `NamedTuple` when `interval == :confidence`")) + mu, lower, upper = res + length(mu) == length(lower) == length(upper) == size(newX, 1) || + throw(DimensionMismatch("length of vectors in `res` must equal the number of rows in `newX`")) + mu .= linkinv.(Link(mm), eta) normalquantile = quantile(Normal(), (1 + level)/2) # Compute confidence intervals in two steps # (2nd step varies depending on `interval_method`) @@ -677,19 +754,19 @@ function predict(mm::AbstractGLM, newX::AbstractMatrix; # 2. Now compute the variance for mu based on variance of eta and # construct intervals based on that (Delta method) stdmu = stdeta .* abs.(mueta.(Link(mm), eta)) - lower = mu .- normalquantile .* stdmu - upper = mu .+ normalquantile .* stdmu + lower .= mu .- normalquantile .* stdmu + upper .= mu .+ normalquantile .* stdmu elseif interval_method == :transformation # 2. 
Construct intervals for eta, then apply inverse link - lower = linkinv.(Link(mm), eta .- normalquantile .* stdeta) - upper = linkinv.(Link(mm), eta .+ normalquantile .* stdeta) + lower .= linkinv.(Link(mm), eta .- normalquantile .* stdeta) + upper .= linkinv.(Link(mm), eta .+ normalquantile .* stdeta) else throw(ArgumentError("interval_method can be only :transformation or :delta")) end else throw(ArgumentError("only :confidence intervals are defined")) end - (prediction = mu, lower = lower, upper = upper) + return res end # A helper function to choose default values for eta diff --git a/src/linpred.jl b/src/linpred.jl index 770e4062..6aca32dd 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -254,18 +254,35 @@ end stderror(x::LinPredModel) = sqrt.(diag(vcov(x))) function show(io::IO, obj::LinPredModel) - println(io, "$(typeof(obj)):\n\nCoefficients:\n", coeftable(obj)) + println(io, nameof(typeof(obj)), '\n') + obj.formula !== nothing && println(io, obj.formula, '\n') + println(io, "Coefficients:\n", coeftable(obj)) +end + +function modelframe(f::FormulaTerm, data, contrasts::AbstractDict, ::Type{M}) where M + Tables.istable(data) || + throw(ArgumentError("expected data in a Table, got $(typeof(data))")) + t = Tables.columntable(data) + msg = StatsModels.checknamesexist(f, t) + msg != "" && throw(ArgumentError(msg)) + data, _ = StatsModels.missing_omit(t, f) + sch = schema(f, data, contrasts) + f = apply_schema(f, sch, M) + f, modelcols(f, data) end -modelframe(obj::LinPredModel) = obj.fr modelmatrix(obj::LinPredModel) = obj.pp.X response(obj::LinPredModel) = obj.rr.y fitted(m::LinPredModel) = m.rr.mu predict(mm::LinPredModel) = fitted(mm) -StatsModels.formula(obj::LinPredModel) = modelframe(obj).formula residuals(obj::LinPredModel) = residuals(obj.rr) +function formula(obj::LinPredModel) + obj.formula === nothing && throw(ArgumentError("model was fitted without a formula")) + return obj.formula +end + """ nobs(obj::LinearModel) nobs(obj::GLM) @@ -283,6 +300,8 @@ end coef(x::LinPred) = x.beta0 coef(obj::LinPredModel) = coef(obj.pp) +coefnames(x::LinPredModel) = + x.formula === nothing ? ["x$i" for i in 1:length(coef(x))] : coefnames(formula(x).rhs) dof_residual(obj::LinPredModel) = nobs(obj) - dof(obj) + 1 @@ -290,3 +309,37 @@ hasintercept(m::LinPredModel) = any(i -> all(==(1), view(m.pp.X , :, i)), 1:size linpred_rank(x::LinPred) = length(x.beta0) linpred_rank(x::DensePredChol{<:Any, <:CholeskyPivoted}) = x.chol.rank + +_coltype(::ContinuousTerm{T}) where {T} = T + +# Function common to all LinPred models, but documented separately +# for LinearModel and GeneralizedLinearModel +function StatsBase.predict(mm::LinPredModel, data; + interval::Union{Symbol,Nothing}=nothing, + kwargs...) + Tables.istable(data) || + throw(ArgumentError("expected data in a Table, got $(typeof(data))")) + + f = formula(mm) + t = Tables.columntable(data) + cols, nonmissings = StatsModels.missing_omit(t, f.rhs) + newx = modelcols(f.rhs, cols) + prediction = Tables.allocatecolumn(Union{_coltype(f.lhs), Missing}, length(nonmissings)) + fill!(prediction, missing) + if interval === nothing + predict!(view(prediction, nonmissings), mm, newx; + interval=interval, kwargs...) 
+ return prediction + else + # Finding integer indices once is faster + nonmissinginds = findall(nonmissings) + lower = Vector{Union{Float64, Missing}}(missing, length(nonmissings)) + upper = Vector{Union{Float64, Missing}}(missing, length(nonmissings)) + tup = (prediction=view(prediction, nonmissinginds), + lower=view(lower, nonmissinginds), + upper=view(upper, nonmissinginds)) + predict!(tup, mm, newx; + interval=interval, kwargs...) + return (prediction=prediction, lower=lower, upper=upper) + end +end diff --git a/src/lm.jl b/src/lm.jl index 2ee28022..dc7dd487 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -75,16 +75,19 @@ residuals(r::LmResp) = r.y - r.mu """ LinearModel -A combination of a [`LmResp`](@ref) and a [`LinPred`](@ref) +A combination of a [`LmResp`](@ref), a [`LinPred`](@ref), +and possibly a `FormulaTerm` # Members - `rr`: a `LmResp` object - `pp`: a `LinPred` object +- `f`: either a `FormulaTerm` object or `nothing` """ struct LinearModel{L<:LmResp,T<:LinPred} <: LinPredModel rr::L pp::T + formula::Union{FormulaTerm,Nothing} end LinearAlgebra.cholesky(x::LinearModel) = cholesky(x.pp) @@ -108,22 +111,14 @@ const FIT_LM_DOC = """ in columns (including if appropriate the intercept), and `y` must be a vector holding values of the dependent variable. - The keyword argument `wts` can be a `Vector` specifying frequency weights for observations. - Such weights are equivalent to repeating each observation a number of times equal - to its weight. Do note that this interpretation gives equal point estimates but - different standard errors from analytical (a.k.a. inverse variance) weights and - from probability (a.k.a. sampling) weights which are the default in some other - software. - - `dropcollinear` controls whether or not `lm` accepts a model matrix which - is less-than-full rank. If `true` (the default), only the first of each set of - linearly-dependent columns is used. The coefficient for redundant linearly dependent columns is - `0.0` and all associated statistics are set to `NaN`. + # Keyword Arguments + $COMMON_FIT_KWARGS_DOCS """ """ - fit(LinearModel, formula, data, allowrankdeficient=false; - [wts::AbstractVector], dropcollinear::Bool=true) + fit(LinearModel, formula::FormulaTerm, data; + [wts::AbstractVector], dropcollinear::Bool=true, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()) fit(LinearModel, X::AbstractMatrix, y::AbstractVector; wts::AbstractVector=similar(y, 0), dropcollinear::Bool=true) @@ -140,12 +135,23 @@ function fit(::Type{LinearModel}, X::AbstractMatrix{<:Real}, y::AbstractVector{< "argument `dropcollinear` instead. 
Proceeding with positional argument value: $allowrankdeficient_dep" dropcollinear = allowrankdeficient_dep end - fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear))) + fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear), nothing)) +end + +function fit(::Type{LinearModel}, f::FormulaTerm, data, + allowrankdeficient_dep::Union{Bool,Nothing}=nothing; + wts::Union{AbstractVector{<:Real}, Nothing}=nothing, + dropcollinear::Bool=true, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()) + f, (y, X) = modelframe(f, data, contrasts, LinearModel) + wts === nothing && (wts = similar(y, 0)) + fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear), f)) end """ - lm(formula, data, allowrankdeficient=false; - [wts::AbstractVector], dropcollinear::Bool=true) + lm(formula, data; + [wts::AbstractVector], dropcollinear::Bool=true, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()) lm(X::AbstractMatrix, y::AbstractVector; wts::AbstractVector=similar(y, 0), dropcollinear::Bool=true) @@ -231,9 +237,10 @@ function coeftable(mm::LinearModel; level::Real=0.95) ci = [isnan(t) ? NaN : -Inf for t in tt] end levstr = isinteger(level*100) ? string(Integer(level*100)) : string(level*100) + cn = coefnames(mm) CoefTable(hcat(cc,se,tt,p,cc+ci,cc-ci), ["Coef.","Std. Error","t","Pr(>|t|)","Lower $levstr%","Upper $levstr%"], - ["x$i" for i = 1:size(mm.pp.X, 2)], 4, 3) + cn, 4, 3) end """ @@ -248,39 +255,70 @@ Valid values of `interval` are `:confidence` delimiting the uncertainty of the predicted relationship, and `:prediction` delimiting estimated bounds for new data points. """ function predict(mm::LinearModel, newx::AbstractMatrix; - interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95) - retmean = newx * coef(mm) + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95) + retmean = similar(view(newx, :, 1)) + if interval === nothing + res = retmean + predict!(res, mm, newx) + else + res = (prediction=retmean, lower=similar(retmean), upper=similar(retmean)) + predict!(res, mm, newx, interval=interval, level=level) + end + return res +end + +function StatsModels.predict!(res::Union{AbstractVector, + NamedTuple{(:prediction, :lower, :upper), + <:NTuple{3, AbstractVector}}}, + mm::LinearModel, newx::AbstractMatrix; + interval::Union{Symbol, Nothing}=nothing, + level::Real=0.95) if interval === :confint Base.depwarn("interval=:confint is deprecated in favor of interval=:confidence", :predict) interval = :confidence end if interval === nothing - return retmean + res isa AbstractVector || + throw(ArgumentError("`res` must be a vector when `interval == nothing` or is omitted")) + length(res) == size(newx, 1) || + throw(DimensionMismatch("length of `res` must equal the number of rows in `newx`")) + res .= newx * coef(mm) elseif mm.pp.chol isa CholeskyPivoted && mm.pp.chol.rank < size(mm.pp.chol, 2) throw(ArgumentError("prediction intervals are currently not implemented " * "when some independent variables have been dropped " * "from the model due to collinearity")) - end - length(mm.rr.wts) == 0 || error("prediction with confidence intervals not yet implemented for weighted regression") - chol = cholesky!(mm.pp) - # get the R matrix from the QR factorization - if chol isa CholeskyPivoted - ip = invperm(chol.p) - R = chol.U[ip, ip] - else - R = chol.U - end - residvar = ones(size(newx,2)) * deviance(mm)/dof_residual(mm) - if interval == :confidence - retvariance = (newx/R).^2 * residvar - elseif interval == :prediction - retvariance = (newx/R).^2 * residvar .+ deviance(mm)/dof_residual(mm) else 
- error("only :confidence and :prediction intervals are defined") + res isa NamedTuple || + throw(ArgumentError("`res` must be a `NamedTuple` when `interval` is " * + "`:confidence` or `:prediction`")) + prediction, lower, upper = res + length(prediction) == length(lower) == length(upper) == size(newx, 1) || + throw(DimensionMismatch("length of vectors in `res` must equal the number of rows in `newx`")) + length(mm.rr.wts) == 0 || error("prediction with confidence intervals not yet implemented for weighted regression") + chol = cholesky!(mm.pp) + # get the R matrix from the QR factorization + if chol isa CholeskyPivoted + ip = invperm(chol.p) + R = chol.U[ip, ip] + else + R = chol.U + end + dev = deviance(mm) + dofr = dof_residual(mm) + residvar = fill(dev/dofr, size(newx, 2), 1) + ret = dropdims((newx/R).^2 * residvar, dims=2) + if interval == :prediction + ret .+= dev/dofr + elseif interval != :confidence + error("only :confidence and :prediction intervals are defined") + end + ret .= quantile(TDist(dofr), (1 - level)/2) .* sqrt.(ret) + prediction .= newx * coef(mm) + lower .= prediction .+ ret + upper .= prediction -+ ret end - retinterval = quantile(TDist(dof_residual(mm)), (1. - level)/2) * sqrt.(retvariance) - (prediction = retmean, lower = retmean .+ retinterval, upper = retmean .- retinterval) + return res end function confint(obj::LinearModel; level::Real=0.95) diff --git a/src/negbinfit.jl b/src/negbinfit.jl index 7c36127c..e01f0336 100644 --- a/src/negbinfit.jl +++ b/src/negbinfit.jl @@ -109,9 +109,9 @@ function negbin(F, maxiter=maxiter, atol=atol, rtol=rtol, verbose=verbose, kwargs...) end - μ = regmodel.model.rr.mu - y = regmodel.model.rr.y - wts = regmodel.model.rr.wts + μ = regmodel.rr.mu + y = regmodel.rr.y + wts = regmodel.rr.wts lw, ly = length(wts), length(y) if lw != ly && lw != 0 throw(ArgumentError("length of wts must be either $ly or 0 but was $lw")) @@ -132,7 +132,7 @@ function negbin(F, verbose && println("[ Alternating iteration ", i, ", θ = ", θ, " ]") regmodel = glm(F, D, NegativeBinomial(θ), args...; maxiter=maxiter, atol=atol, rtol=rtol, verbose=verbose, kwargs...) 
- μ = regmodel.model.rr.mu + μ = regmodel.rr.mu prevθ = θ θ = mle_for_θ(y, μ, wts; maxiter=maxiter, tol=rtol) δ = prevθ - θ diff --git a/test/runtests.jl b/test/runtests.jl index 78b78a2d..c51d9b25 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,7 +31,7 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y Σ = [6.136653061224592e-05 -9.464489795918525e-05 -9.464489795918525e-05 1.831836734693908e-04] @test isapprox(vcov(lm1), Σ) - @test isapprox(cor(lm1.model), Diagonal(diag(Σ))^(-1/2)*Σ*Diagonal(diag(Σ))^(-1/2)) + @test isapprox(cor(lm1), Diagonal(diag(Σ))^(-1/2)*Σ*Diagonal(diag(Σ))^(-1/2)) @test dof(lm1) == 3 @test isapprox(deviance(lm1), 0.0002992000000000012) @test isapprox(loglikelihood(lm1), 21.204842144047973) @@ -48,20 +48,20 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y @test isa(lm2.pp.chol, CholeskyPivoted) @test lm2.pp.chol.piv == [2, 1] @test isapprox(coef(lm1), coef(lm2) .* [1., 10.]) - lm3 = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), contrasts=Dict(:x=>DummyCoding())) - lm4 = lm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5)))) - @test coef(lm3) == coef(lm4) ≈ [11, 1, 2, 3, 4] + # Deprecated methods + @test lm1.model === lm1 + @test lm1.mf.f == formula(lm1) end @testset "Linear Model Cook's Distance" begin - st_df = DataFrame( + st_df = DataFrame( Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], - XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], + XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], XC=[3., 13., 23., 39.8, 34., 31.], # values from SAS proc reg - CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], - CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], + CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], + CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], ) @@ -69,7 +69,7 @@ end t_lm_base = lm(@formula(Y ~ XA), st_df) @test isapprox(st_df.CooksD_base, cooksdistance(t_lm_base)) - # linear regression, no intercept + # linear regression, no intercept t_lm_noint = lm(@formula(Y ~ XA +0), st_df) @test isapprox(st_df.CooksD_noint, cooksdistance(t_lm_noint)) @@ -85,7 +85,7 @@ end end -@testset "linear model with weights" begin +@testset "linear model with weights" begin df = dataset("quantreg", "engel") N = nrow(df) df.weights = repeat(1:5, Int(N/5)) @@ -97,7 +97,7 @@ end @test isapprox(stderror(lm_model), [9.382302620120193, 0.00816741377772968]) @test isapprox(r2(lm_model), 0.8330258148644486) @test isapprox(adjr2(lm_model), 0.832788298242634) - @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; + @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; -0.06772589439264813 6.670664781664879e-5]) @test isapprox(first(predict(lm_model)), 357.57694841780994) @test isapprox(loglikelihood(lm_model), -4353.946729075838) @@ -155,7 +155,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 1, 2] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -166,7 +166,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 
2, 3] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -177,7 +177,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 1, 2] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -188,7 +188,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 2, 3] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -201,7 +201,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 1, 2, 0, 0] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -223,7 +223,7 @@ end mdl = lm(@formula(y ~ 0 + x), data) @test coef(mdl) ≈ [2.07438016528926] @test stderror(mdl) ≈ [0.165289256198347E-01] - @test GLM.dispersion(mdl.model) ≈ 3.56753034006338 + @test GLM.dispersion(mdl) ≈ 3.56753034006338 @test dof(mdl) == 2 @test dof_residual(mdl) == 10 @test r2(mdl) ≈ 0.999365492298663 @@ -246,7 +246,7 @@ end mdl = lm(@formula(y ~ 0 + x), data) @test coef(mdl) ≈ [0.727272727272727] @test stderror(mdl) ≈ [0.420827318078432E-01] - @test GLM.dispersion(mdl.model) ≈ 0.369274472937998 + @test GLM.dispersion(mdl) ≈ 0.369274472937998 @test dof(mdl) == 2 @test dof_residual(mdl) == 2 @test r2(mdl) ≈ 0.993348115299335 @@ -268,7 +268,7 @@ end @test coef(mdl1) ≈ coef(mdl2) @test stderror(mdl1) ≈ stderror(mdl2) - @test GLM.dispersion(mdl1.model) ≈ GLM.dispersion(mdl2) + @test GLM.dispersion(mdl1) ≈ GLM.dispersion(mdl2) @test dof(mdl1) ≈ dof(mdl2) @test dof_residual(mdl1) ≈ dof_residual(mdl2) @test r2(mdl1) ≈ r2(mdl2) @@ -289,7 +289,7 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], @testset "Poisson GLM" begin gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ 1 + Outcome + Treatment), dobson, Poisson()) - @test GLM.cancancel(gm1.model.rr) + @test GLM.cancancel(gm1.rr) test_show(gm1) @test dof(gm1) == 5 @test isapprox(deviance(gm1), 5.12914107700115, rtol = 1e-7) @@ -301,7 +301,7 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], @test isapprox(bic(gm1), 57.74744128863877) @test isapprox(coef(gm1)[1:3], [3.044522437723423,-0.45425527227759555,-0.29298712468147375]) - + @test isapprox(residuals(gm1; type=:deviance), [-0.6712492, 0.9627236, -0.1696466, -0.2199851, -0.9555235, 1.049386, 0.8471537, -0.09167147, -0.9665637]; @@ -311,14 +311,17 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], # -0.9128709, 1.094797, 0.8728716, -0.09128709, -0.9263671]; # atol=1e-6) @test isapprox(residuals(gm1; type=:response), - [-3, 3.666667, -0.6666667, -1, -3.333333, + [-3, 3.666667, -0.6666667, -1, -3.333333, 4.333333, 4, -0.3333333, -3.666667]; atol=1e-6) @test isapprox(residuals(gm1; type=:working), - [-0.1428571, 0.275, -0.04255319, -0.04761905, + [-0.1428571, 0.275, -0.04255319, -0.04761905, -0.25, 0.2765957, 0.1904762, -0.025, -0.2340426]; atol=1e-6) - + + # Deprecated methods + @test gm1.model === gm1 + @test gm1.mf.f == formula(gm1) end ## Example from http://www.ats.ucla.edu/stat/r/dae/logit.htm @@ -327,7 +330,7 @@ admit.rank = categorical(admit.rank) @testset "$distr with LogitLink" for distr in (Binomial, Bernoulli) gm2 = fit(GeneralizedLinearModel, @formula(admit ~ 1 + gre + gpa + rank), admit, distr()) - @test GLM.cancancel(gm2.model.rr) + @test 
GLM.cancancel(gm2.rr) test_show(gm2) @test dof(gm2) == 6 @test deviance(gm2) ≈ 458.5174924758994 @@ -358,8 +361,8 @@ admit.rank = categorical(admit.rank) 0.7465767, -0.48867, -0.1655043, -0.1810622, -0.4636674, -0.3007306]; atol=1e-6) # res = residuals(gm2; type=:pearson) # @test isapprox(res[1:10], - # [-0.4567757, 1.556473, 0.5952011, 2.146128, -0.3663905, - # 1.304961, 1.176959, -0.5264452, 1.995417, -1.036398]; atol=1e-6) + # [-0.4567757, 1.556473, 0.5952011, 2.146128, -0.3663905, + # 1.304961, 1.176959, -0.5264452, 1.995417, -1.036398]; atol=1e-6) # @test isapprox(res[390:400], # [-0.8211834, 1.22236, 1.025949, 1.871874, 1.131075, # 1.716382, -0.977591, -0.4453409, -0.4702063, -0.9297929]; atol=1e-6) @@ -374,7 +377,7 @@ end gm3 = fit(GeneralizedLinearModel, @formula(admit ~ 1 + gre + gpa + rank), admit, Binomial(), ProbitLink()) test_show(gm3) - @test !GLM.cancancel(gm3.model.rr) + @test !GLM.cancancel(gm3.rr) @test dof(gm3) == 6 @test isapprox(deviance(gm3), 458.4131713833386) @test isapprox(nulldeviance(gm3), 499.9765175549236) @@ -391,7 +394,7 @@ end @testset "Bernoulli CauchitLink" begin gm4 = fit(GeneralizedLinearModel, @formula(admit ~ gre + gpa + rank), admit, Binomial(), CauchitLink()) - @test !GLM.cancancel(gm4.model.rr) + @test !GLM.cancancel(gm4.rr) test_show(gm4) @test dof(gm4) == 6 @test isapprox(deviance(gm4), 459.3401112751141) @@ -406,7 +409,7 @@ end @testset "Bernoulli CloglogLink" begin gm5 = fit(GeneralizedLinearModel, @formula(admit ~ gre + gpa + rank), admit, Binomial(), CloglogLink()) - @test !GLM.cancancel(gm5.model.rr) + @test !GLM.cancancel(gm5.rr) test_show(gm5) @test dof(gm5) == 6 @test isapprox(deviance(gm5), 458.89439629612616) @@ -436,8 +439,7 @@ anorexia = CSV.read(joinpath(glm_datadir, "anorexia.csv"), DataFrame) @testset "Normal offset" begin gm6 = fit(GeneralizedLinearModel, @formula(Postwt ~ 1 + Prewt + Treat), anorexia, Normal(), IdentityLink(), offset=Array{Float64}(anorexia.Prewt)) - - @test GLM.cancancel(gm6.model.rr) + @test GLM.cancancel(gm6.rr) test_show(gm6) @test dof(gm6) == 5 @test isapprox(deviance(gm6), 3311.262619919613) @@ -449,7 +451,7 @@ anorexia = CSV.read(joinpath(glm_datadir, "anorexia.csv"), DataFrame) @test isapprox(bic(gm6), 501.35662813730465) @test isapprox(coef(gm6), [49.7711090, -0.5655388, -4.0970655, 4.5630627]) - @test isapprox(GLM.dispersion(gm6.model, true), 48.6950385282296) + @test isapprox(GLM.dispersion(gm6, true), 48.6950385282296) @test isapprox(stderror(gm6), [13.3909581, 0.1611824, 1.8934926, 2.1333359]) end @@ -457,8 +459,7 @@ end @testset "Normal LogLink offset" begin gm7 = fit(GeneralizedLinearModel, @formula(Postwt ~ 1 + Prewt + Treat), anorexia, Normal(), LogLink(), offset=anorexia.Prewt, rtol=1e-8) - - @test !GLM.cancancel(gm7.model.rr) + @test !GLM.cancancel(gm7.rr) test_show(gm7) @test isapprox(deviance(gm7), 3265.207242977156) @test isapprox(nulldeviance(gm7), 507625.1718547432) @@ -466,7 +467,7 @@ end @test isapprox(nullloglikelihood(gm7), -421.1535438334255) @test isapprox(coef(gm7), [3.99232679, -0.99445269, -0.05069826, 0.05149403]) - @test isapprox(GLM.dispersion(gm7.model, true), 48.017753573192266) + @test isapprox(GLM.dispersion(gm7, true), 48.017753573192266) @test isapprox(stderror(gm7), [0.157167944, 0.001886286, 0.022584069, 0.023882826], atol=1e-6) @@ -476,7 +477,7 @@ end gm7p = fit(GeneralizedLinearModel, @formula(round(Postwt) ~ 1 + Prewt + Treat), anorexia, Poisson(), LogLink(), offset=log.(anorexia.Prewt), rtol=1e-8) - @test GLM.cancancel(gm7p.model.rr) + @test GLM.cancancel(gm7p.rr) 
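    # NB: `offset` enters the linear predictor with its coefficient fixed at 1,
    # so with `LogLink` and `offset=log.(anorexia.Prewt)` the model for gm7p is
    # log(E[round(Postwt)]) = log(Prewt) + Xβ, i.e. a model for weight relative
    # to pre-treatment weight. A rough consistency check (a sketch; `modelmatrix`
    # is assumed here to return the design matrix of the fitted model):
    #     fitted(gm7p) ≈ exp.(modelmatrix(gm7p) * coef(gm7p) .+ log.(anorexia.Prewt))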
test_show(gm7p) @test deviance(gm7p) ≈ 39.686114742427705 @test nulldeviance(gm7p) ≈ 54.749010639715294 @@ -493,7 +494,7 @@ end Poisson(), LogLink(), offset=log.(anorexia.Prewt), wts=repeat(1:4, outer=18), rtol=1e-8) - @test GLM.cancancel(gm7pw.model.rr) + @test GLM.cancancel(gm7pw.rr) test_show(gm7pw) @test deviance(gm7pw) ≈ 90.17048668870225 @test nulldeviance(gm7pw) ≈ 139.63782826574652 @@ -511,8 +512,8 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]), @testset "Gamma" begin gm8 = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, Gamma()) - @test !GLM.cancancel(gm8.model.rr) - @test isa(GLM.Link(gm8.model), InverseLink) + @test !GLM.cancancel(gm8.rr) + @test isa(GLM.Link(gm8), InverseLink) test_show(gm8) @test dof(gm8) == 3 @test isapprox(deviance(gm8), 0.016729715178484157) @@ -523,30 +524,30 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]), @test isapprox(aicc(gm8), 42.78992394955449) @test isapprox(bic(gm8), 38.58159768156315) @test isapprox(coef(gm8), [-0.01655438172784895,0.01534311491072141]) - @test isapprox(GLM.dispersion(gm8.model, true), 0.002446059333495581, atol=1e-6) + @test isapprox(GLM.dispersion(gm8, true), 0.002446059333495581, atol=1e-6) @test isapprox(stderror(gm8), [0.00092754223, 0.000414957683], atol=1e-6) - + @test isapprox(residuals(gm8; type=:response), [-4.859041, 4.736111, 1.992869, 0.9973619, -1.065779, 0.02779383, -0.614323, -0.7318223, -0.4831699]; atol=1e-6) - @test isapprox(residuals(gm8; type=:working), + @test isapprox(residuals(gm8; type=:working), [0.0003219114, -0.001669384, -0.001245099, -0.0008626359, - 0.001353047, -4.456918e-05, 0.001314963, 0.001879625, 0.001414318]; + 0.001353047, -4.456918e-05, 0.001314963, 0.001879625, 0.001414318]; atol=1e-6) - @test isapprox(residuals(gm8; type=:deviance), + @test isapprox(residuals(gm8; type=:deviance), [-0.04008349, 0.08641118, 0.04900896, 0.02904992, - -0.03846595, 0.001112578, -0.02869586, -0.03755713, -0.0263724]; + -0.03846595, 0.001112578, -0.02869586, -0.03755713, -0.0263724]; atol=1e-6) - # @test isapprox(residuals(gm8; type=:pearson), + # @test isapprox(residuals(gm8; type=:pearson), # [-0.03954973, 0.08891786, 0.04981284, 0.0293319, - # -0.03797433, 0.001112991, -0.02842204, -0.03708843, -0.02614107]; + # -0.03797433, 0.001112991, -0.02842204, -0.03708843, -0.02614107]; # atol=1e-6) end @testset "InverseGaussian" begin gm8a = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, InverseGaussian()) - @test !GLM.cancancel(gm8a.model.rr) - @test isa(GLM.Link(gm8a.model), InverseSquareLink) + @test !GLM.cancancel(gm8a.rr) + @test isa(GLM.Link(gm8a), InverseSquareLink) test_show(gm8a) @test dof(gm8a) == 3 @test isapprox(deviance(gm8a), 0.006931128347234519) @@ -557,19 +558,19 @@ end @test isapprox(aicc(gm8a), 66.37485201769974) @test isapprox(bic(gm8a), 62.16652574970839) @test isapprox(coef(gm8a), [-0.0011079770504295668,0.0007219138982289362]) - @test isapprox(GLM.dispersion(gm8a.model, true), 0.0011008719709455776, atol=1e-6) + @test isapprox(GLM.dispersion(gm8a, true), 0.0011008719709455776, atol=1e-6) @test isapprox(stderror(gm8a), [0.0001675339726910311,9.468485015919463e-5], atol=1e-6) - - @test isapprox(residuals(gm8a; type=:response), + + @test isapprox(residuals(gm8a; type=:response), [-18.21078, 15.52523, 7.639634, 4.20793, -0.2428536, -0.3585344, -2.263445, -3.056904, -3.240284]; atol=1e-5) - @test isapprox(residuals(gm8a; type=:working), + @test isapprox(residuals(gm8a; type=:working), [1.441199e-05, -0.0004052051, -0.0003766424, 
-0.0002882584, 2.402242e-05, - 4.397323e-05, 0.0003595653, 0.0005697418, 0.0006762887]; + 4.397323e-05, 0.0003595653, 0.0005697418, 0.0006762887]; atol=1e-6) - @test isapprox(residuals(gm8a; type=:deviance), - [-0.01230767, 0.04799467, 0.03430759, 0.02309913, -0.001715577, - -0.002827721, -0.02123177, -0.03179512, -0.0359572]; + @test isapprox(residuals(gm8a; type=:deviance), + [-0.01230767, 0.04799467, 0.03430759, 0.02309913, -0.001715577, + -0.002827721, -0.02123177, -0.03179512, -0.0359572]; atol=1e-6) # @test isapprox(residuals(gm8a; type=:pearson), # [-0.01145543, 0.05608432, 0.03793027, 0.02462693, -0.001707913, @@ -579,7 +580,7 @@ end @testset "Gamma LogLink" begin gm9 = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, Gamma(), LogLink(), rtol=1e-8, atol=0.0) - @test !GLM.cancancel(gm9.model.rr) + @test !GLM.cancancel(gm9.rr) test_show(gm9) @test dof(gm9) == 3 @test deviance(gm9) ≈ 0.16260829451739 @@ -590,14 +591,14 @@ end @test aicc(gm9) ≈ 63.28165620769822 @test bic(gm9) ≈ 59.07332993970688 @test coef(gm9) ≈ [5.50322528458221, -0.60191617825971] - @test GLM.dispersion(gm9.model, true) ≈ 0.02435442293561081 + @test GLM.dispersion(gm9, true) ≈ 0.02435442293561081 @test stderror(gm9) ≈ [0.19030107482720, 0.05530784660144] end @testset "Gamma IdentityLink" begin gm10 = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, Gamma(), IdentityLink(), rtol=1e-8, atol=0.0) - @test !GLM.cancancel(gm10.model.rr) + @test !GLM.cancancel(gm10.rr) test_show(gm10) @test dof(gm10) == 3 @test isapprox(deviance(gm10), 0.60845414895344) @@ -608,7 +609,7 @@ end @test isapprox(aicc(gm10), 75.23214487456835) @test isapprox(bic(gm10), 71.02381860657701) @test isapprox(coef(gm10), [99.250446880986, -18.374324929002]) - @test isapprox(GLM.dispersion(gm10.model, true), 0.10417373, atol=1e-6) + @test isapprox(GLM.dispersion(gm10, true), 0.10417373, atol=1e-6) @test isapprox(stderror(gm10), [17.864084, 4.297895], atol=1e-4) end @@ -695,7 +696,7 @@ end quine = dataset("MASS", "quine") @testset "NegativeBinomial LogLink Fixed θ" begin gm18 = fit(GeneralizedLinearModel, @formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink()) - @test !GLM.cancancel(gm18.model.rr) + @test !GLM.cancancel(gm18.rr) test_show(gm18) @test dof(gm18) == 8 @test isapprox(deviance(gm18), 239.11105911824325, rtol = 1e-7) @@ -713,7 +714,7 @@ end @testset "NegativeBinomial NegativeBinomialLink Fixed θ" begin # the default/canonical link is NegativeBinomialLink gm19 = fit(GeneralizedLinearModel, @formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0)) - @test GLM.cancancel(gm19.model.rr) + @test GLM.cancancel(gm19.rr) test_show(gm19) @test dof(gm19) == 8 @test isapprox(deviance(gm19), 239.68562048977307, rtol = 1e-7) @@ -791,8 +792,8 @@ end @test aic(gm22) ≈ 1112.7422553284146 @test aicc(gm22) ≈ 1113.7933502189255 @test bic(gm22) ≈ 1136.6111083020812 - @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, - -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, + @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, + -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, 0.29016080736927813] @test stderror(gm22) ≈ [0.22754287093719366, 0.15274755092180423, 0.15928431669166637, 0.23853372776980591, 0.2354231414867577, 0.24750780320597515, @@ -817,9 +818,9 @@ end @testset "GLM with no intercept" begin # Gamma with single numeric predictor nointglm1 = fit(GeneralizedLinearModel, @formula(lot1 ~ 0 + u), clotting, Gamma()) 
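    # NB: `0 + u` (equivalently `-1 + u`) suppresses the intercept, so the model
    # matrix is the single column `u` and, under Gamma's canonical InverseLink,
    # the fitted mean satisfies 1/μ = β₁u; hence the `!hasintercept` and
    # `InverseLink` checks below.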
- @test !hasintercept(nointglm1.model) - @test !GLM.cancancel(nointglm1.model.rr) - @test isa(GLM.Link(nointglm1.model), InverseLink) + @test !hasintercept(nointglm1) + @test !GLM.cancancel(nointglm1.rr) + @test isa(GLM.Link(nointglm1), InverseLink) test_show(nointglm1) @test dof(nointglm1) == 2 @test deviance(nointglm1) ≈ 0.6629903395245351 @@ -830,13 +831,13 @@ end @test aicc(nointglm1) ≈ 71.21377945777526 @test bic(nointglm1) ≈ 69.6082286124477 @test coef(nointglm1) ≈ [0.009200201253724151] - @test GLM.dispersion(nointglm1.model, true) ≈ 0.10198331431820506 + @test GLM.dispersion(nointglm1, true) ≈ 0.10198331431820506 @test stderror(nointglm1) ≈ [0.000979309363228589] # Bernoulli with numeric predictors nointglm2 = fit(GeneralizedLinearModel, @formula(admit ~ 0 + gre + gpa), admit, Bernoulli()) - @test !hasintercept(nointglm2.model) - @test GLM.cancancel(nointglm2.model.rr) + @test !hasintercept(nointglm2) + @test GLM.cancancel(nointglm2.rr) test_show(nointglm2) @test dof(nointglm2) == 2 @test deviance(nointglm2) ≈ 503.5584368354113 @@ -853,8 +854,8 @@ end nointglm3 = fit(GeneralizedLinearModel, @formula(round(Postwt) ~ 0 + Prewt + Treat), anorexia, Poisson(), LogLink(); offset=log.(anorexia.Prewt), wts=repeat(1:4, outer=18), rtol=1e-8, dropcollinear=false) - @test !hasintercept(nointglm3.model) - @test GLM.cancancel(nointglm3.model.rr) + @test !hasintercept(nointglm3) + @test GLM.cancancel(nointglm3.rr) test_show(nointglm3) @test deviance(nointglm3) ≈ 90.17048668870225 @test nulldeviance(nointglm3) ≈ 159.32999067102548 @@ -900,6 +901,7 @@ end end @testset "Predict" begin + # Binomial GLM rng = StableRNG(123) X = rand(rng, 10, 2) Y = logistic.(X * [3; -3]) @@ -907,7 +909,7 @@ end gm11 = fit(GeneralizedLinearModel, X, Y, Binomial()) @test isapprox(predict(gm11), Y) @test predict(gm11) == fitted(gm11) - + newX = rand(rng, 5, 2) newY = logistic.(newX * coef(gm11)) gm11_pred1 = predict(gm11, newX) @@ -931,6 +933,28 @@ end @test ndims(gm11_pred3.upper) == 1 @test ndims(gm11_pred3.lower) == 1 + @test predict!(similar(Y, size(newX, 1)), gm11, newX) == predict(gm11, newX) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), + gm11, newX, interval=:confidence, interval_method=:delta) == + predict(gm11, newX, interval=:confidence, interval_method=:delta) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), + gm11, newX, interval=:confidence, interval_method=:transformation) == + predict(gm11, newX, interval=:confidence, interval_method=:transformation) + @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), gm11, newX) + @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), gm11, newX, + interval=:confidence) + @test_throws DimensionMismatch predict!([1], gm11, newX) + @test_throws DimensionMismatch predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=[1]), + gm11, newX, interval=:confidence) + off = rand(rng, 10) newoff = rand(rng, 5) @@ -941,17 +965,67 @@ end @test isapprox(predict(gm12, newX, offset=newoff), logistic.(newX * coef(gm12) .+ newoff)) + # Prediction from DataFrames d = DataFrame(X, :auto) d.y = Y gm13 = fit(GeneralizedLinearModel, @formula(y ~ 0 + x1 + x2), d, Binomial()) - @test predict(gm13) ≈ predict(gm13, d[:,[:x1, :x2]]) - @test predict(gm13) ≈ predict(gm13, d) + @test predict(gm13) ≈ 
predict(gm13, d[:,[:x1, :x2]]) == predict(gm13, X)
+    @test predict(gm13) ≈ predict(gm13, d) == predict(gm13, X)
+    @test predict(gm13) ≈ predict(gm11)
+    @test predict(gm13, newX; interval=:confidence, interval_method=:delta) ==
+        predict(gm11, newX; interval=:confidence, interval_method=:delta)
+    @test predict(gm13, newX; interval=:confidence, interval_method=:transformation) ==
+        predict(gm11, newX; interval=:confidence, interval_method=:transformation)

     newd = DataFrame(newX, :auto)
-    predict(gm13, newd)
-
+    @test predict(gm13, newd) == predict(gm13, newX)
+    @test predict(gm13, newd; interval=:confidence, interval_method=:delta) ==
+        predict(gm11, newX; interval=:confidence, interval_method=:delta)
+    @test predict(gm13, newd; interval=:confidence, interval_method=:transformation) ==
+        predict(gm11, newX; interval=:confidence, interval_method=:transformation)
+
+
+    # Prediction from DataFrames with missing values
+    drep = d[[1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10], :]
+    dm = allowmissing(drep)
+    dm.x1[3] = missing
+    dm.y[9] = missing
+
+    gm13m = fit(GeneralizedLinearModel, @formula(y ~ 0 + x1 + x2), dm, Binomial())
+    @test predict(gm13m) == predict(gm13)
+    @test predict(gm13m, d) == predict(gm13, d)
+    @test isequal(predict(gm13m, dm), predict(gm13, dm))
+    gm13m_pred2 = predict(gm13m, dm; interval=:confidence, interval_method=:delta)
+    gm13m_pred3 = predict(gm13m, dm; interval=:confidence, interval_method=:transformation)
+    expected_pred = allowmissing(predict(gm13m, drep))
+    expected_pred[3] = missing
+    @test collect(skipmissing(predict(gm13m, dm))) ≈
+        collect(skipmissing(predict(gm13, dm))) ≈
+        collect(skipmissing(gm13m_pred2.prediction)) ≈
+        collect(skipmissing(gm13m_pred3.prediction)) ≈
+        collect(skipmissing(expected_pred))
+    @test ismissing.(predict(gm13m, dm)) ==
+        ismissing.(predict(gm13, dm)) ==
+        ismissing.(gm13m_pred2.prediction) ==
+        ismissing.(gm13m_pred3.prediction) ==
+        ismissing.(expected_pred)
+    expected_lower =
+        allowmissing(predict(gm13m, drep;
+                             interval=:confidence, interval_method=:delta).lower)
+    expected_lower[3] = missing
+    @test collect(skipmissing(gm13m_pred2.lower)) ≈ collect(skipmissing(expected_lower))
+    @test ismissing.(gm13m_pred2.lower) == ismissing.(expected_lower)
+    expected_upper =
+        allowmissing(predict(gm13m, drep;
+                             interval=:confidence, interval_method=:delta).upper)
+    expected_upper[3] = missing
+    @test collect(skipmissing(gm13m_pred2.upper)) ≈ collect(skipmissing(expected_upper))
+    @test ismissing.(gm13m_pred2.upper) == ismissing.(expected_upper)
+
+
+    # Linear Model
     Ylm = X * [0.8, 1.6] + 0.8randn(rng, 10)
     mm = fit(LinearModel, X, Ylm)
     pred1 = predict(mm, newX)
@@ -979,6 +1053,88 @@ end
     @test pred3.upper ≈ pred3.prediction + quantile(TDist(dof_residual(mm)), 0.975)*sqrt.(diag(newX*vcov(mm)*newX') .+ deviance(mm)/dof_residual(mm)) ≈
         [3.9288331595737196, 4.077092463922373, 4.762903743958081, 3.82184595169028, 4.034521019386702]

+    @test predict!(similar(Y, size(newX, 1)), mm, newX) == predict(mm, newX)
+    @test predict!((prediction=similar(Y, size(newX, 1)),
+                    lower=similar(Y, size(newX, 1)),
+                    upper=similar(Y, size(newX, 1))),
+                   mm, newX, interval=:confidence) ==
+        predict(mm, newX, interval=:confidence)
+    @test predict!((prediction=similar(Y, size(newX, 1)),
+                    lower=similar(Y, size(newX, 1)),
+                    upper=similar(Y, size(newX, 1))),
+                   mm, newX, interval=:prediction) ==
+        predict(mm, newX, interval=:prediction)
+    @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)),
+                                         lower=similar(Y, size(newX, 1)),
+                                         upper=similar(Y, size(newX, 1))),
+                                        mm, newX)
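+    # NB: `predict!` fills caller-allocated storage and returns it: a plain
+    # vector when `interval === nothing`, and a `(prediction, lower, upper)`
+    # NamedTuple of vectors otherwise; the wrong container kind raises an
+    # ArgumentError and wrong lengths raise a DimensionMismatch, which is what
+    # the tests around here exercise. A minimal sketch (`buf` is a scratch
+    # vector introduced only for illustration):
+    #     buf = similar(Ylm, size(newX, 1)); predict!(buf, mm, newX)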
+    @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), mm, newX,
+                                        interval=:confidence)
+    @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), mm, newX,
+                                        interval=:prediction)
+    @test_throws DimensionMismatch predict!([1], mm, newX)
+    @test_throws DimensionMismatch predict!((prediction=similar(Y, size(newX, 1)),
+                                             lower=similar(Y, size(newX, 1)),
+                                             upper=[1]),
+                                            mm, newX, interval=:confidence)
+
+
+    # Prediction from DataFrames
+    d = DataFrame(X, :auto)
+    d.y = Ylm
+
+    mmd = lm(@formula(y ~ 0 + x1 + x2), d)
+    @test predict(mmd) ≈ predict(mmd, d[:,[:x1, :x2]]) == predict(mm, X)
+    @test predict(mmd) ≈ predict(mmd, d) == predict(mm, X)
+    @test predict(mmd) ≈ predict(mm)
+    @test predict(mmd, newX; interval=:confidence) ==
+        predict(mm, newX; interval=:confidence)
+    @test predict(mmd, newX; interval=:prediction) ==
+        predict(mm, newX; interval=:prediction)
+
+    newd = DataFrame(newX, :auto)
+    @test predict(mmd, newd) == predict(mm, newX)
+    @test predict(mmd, newd; interval=:confidence) ==
+        predict(mm, newX; interval=:confidence)
+    @test predict(mmd, newd; interval=:prediction) ==
+        predict(mm, newX; interval=:prediction)
+
+
+    # Prediction from DataFrames with missing values
+    drep = d[[1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10], :]
+    dm = allowmissing(drep)
+    dm.x1[3] = missing
+    dm.y[9] = missing
+
+    mmdm = lm(@formula(y ~ 0 + x1 + x2), dm)
+    @test predict(mmdm) == predict(mmd)
+    @test predict(mmdm, d) == predict(mmd, d)
+    @test isequal(predict(mmdm, dm), predict(mmd, dm))
+    mmdm_pred2 = predict(mmdm, dm; interval=:confidence)
+    mmdm_pred3 = predict(mmdm, dm; interval=:prediction)
+    expected_pred = allowmissing(predict(mmdm, drep))
+    expected_pred[3] = missing
+    @test collect(skipmissing(predict(mmdm, dm))) ≈
+        collect(skipmissing(predict(mmd, dm))) ≈
+        collect(skipmissing(mmdm_pred2.prediction)) ≈
+        collect(skipmissing(mmdm_pred3.prediction)) ≈
+        collect(skipmissing(expected_pred))
+    @test ismissing.(predict(mmdm, dm)) ==
+        ismissing.(predict(mmd, dm)) ==
+        ismissing.(mmdm_pred2.prediction) ==
+        ismissing.(mmdm_pred3.prediction) ==
+        ismissing.(expected_pred)
+    expected_lower =
+        allowmissing(predict(mmdm, drep; interval=:confidence).lower)
+    expected_lower[3] = missing
+    @test collect(skipmissing(mmdm_pred2.lower)) ≈ collect(skipmissing(expected_lower))
+    @test ismissing.(mmdm_pred2.lower) == ismissing.(expected_lower)
+    expected_upper =
+        allowmissing(predict(mmdm, drep; interval=:confidence).upper)
+    expected_upper[3] = missing
+    @test collect(skipmissing(mmdm_pred2.upper)) ≈ collect(skipmissing(expected_upper))
+    @test ismissing.(mmdm_pred2.upper) == ismissing.(expected_upper)
+
     # Prediction with dropcollinear (#409)
     x = [1.0 1.0
          1.0 2.0
@@ -1061,10 +1217,10 @@ end
     d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.],
                   Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2],
                   Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1]))
-    mod = lm(@formula(Result~Treatment), d).model
-    othermod = lm(@formula(Result~Other), d).model
-    nullmod = lm(@formula(Result~1), d).model
-    bothmod = lm(@formula(Result~Other+Treatment), d).model
+    mod = lm(@formula(Result~Treatment), d)
+    othermod = lm(@formula(Result~Other), d)
+    nullmod = lm(@formula(Result~1), d)
+    bothmod = lm(@formula(Result~Other+Treatment), d)
     nointerceptmod = lm(reshape(d.Treatment, :, 1), d.Result)
     ft1 = ftest(mod)
@@ -1116,10 +1272,10 @@ end
     d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.],
                   Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2],
                   Other=categorical([1, 1, 2, 1, 2, 1,
3, 1, 1, 2, 2, 1])) - mod = lm(@formula(Result~Treatment), d).model - othermod = lm(@formula(Result~Other), d).model - nullmod = lm(@formula(Result~1), d).model - bothmod = lm(@formula(Result~Other+Treatment), d).model + mod = lm(@formula(Result~Treatment), d) + othermod = lm(@formula(Result~Other), d) + nullmod = lm(@formula(Result~1), d) + bothmod = lm(@formula(Result~Other+Treatment), d) @test StatsModels.isnested(nullmod, mod) @test !StatsModels.isnested(othermod, mod) @test StatsModels.isnested(mod, bothmod) @@ -1127,7 +1283,7 @@ end @test StatsModels.isnested(othermod, bothmod) d.Sum = d.Treatment + (d.Other .== 1) - summod = lm(@formula(Result~Sum), d).model + summod = lm(@formula(Result~Sum), d) @test StatsModels.isnested(summod, bothmod) ft1a = ftest(mod, nullmod) @@ -1277,7 +1433,7 @@ end @test hcat(t.cols[5:6]...) == confint(lm1) # TODO: call coeftable(gm1, ...) directly once DataFrameRegressionModel # supports keyword arguments - t = coeftable(lm1.model, level=0.99) + t = coeftable(lm1, level=0.99) @test hcat(t.cols[5:6]...) == confint(lm1, level=0.99) gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ 1 + Outcome + Treatment), @@ -1288,9 +1444,7 @@ end @test t.cols[4] ≈ [5.4267674619082684e-71, 0.024647114627808674, 0.12848651178787643, 0.9999999999999981, 0.9999999999999999] @test hcat(t.cols[5:6]...) == confint(gm1) - # TODO: call coeftable(gm1, ...) directly once DataFrameRegressionModel - # supports keyword arguments - t = coeftable(gm1.model, level=0.99) + t = coeftable(gm1, level=0.99) @test hcat(t.cols[5:6]...) == confint(gm1, level=0.99) end @@ -1370,10 +1524,10 @@ end Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])) - mod = lm(@formula(Result~Treatment), d).model + mod = lm(@formula(Result~Treatment), d) @test hasintercept(mod) - nullmod = lm(@formula(Result~1), d).model + nullmod = lm(@formula(Result~1), d) @test hasintercept(nullmod) nointerceptmod = lm(reshape(d.Treatment, :, 1), d.Result) @@ -1483,7 +1637,7 @@ end @test coef(mdl) ≈ [-0.05132238692134761, 0.01428684676273272, 0.15033126098228242] @test stderror(mdl) ≈ [0.224095414423756, 0.003342439119757, 0.005838227761632] atol=1.0e-8 @test dof(mdl) == 4 - @test GLM.dispersion(mdl.model, true) ≈ 6.577062388609384 + @test GLM.dispersion(mdl, true) ≈ 6.577062388609384 @test loglikelihood(mdl) ≈ -71.60507986987612 @test deviance(mdl) ≈ 184.15774688106 @test aic(mdl) ≈ 151.21015973975 @@ -1496,7 +1650,7 @@ end @test stderror(mdl1) ≈ stderror(mdl2) @test dof(mdl1) == dof(mdl2) @test dof_residual(mdl1) == dof_residual(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2,true) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) @test confint(mdl1) ≈ confint(mdl2) @@ -1510,7 +1664,7 @@ end @test stderror(mdl1) ≈ stderror(mdl2) @test dof(mdl1) == dof(mdl2) @test dof_residual(mdl1) == dof_residual(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2,true) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) @test confint(mdl1) ≈ confint(mdl2) @@ -1526,15 +1680,96 @@ end @test dof_residual(mdl1) == dof_residual(mdl2) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ 
GLM.dispersion(mdl2,true) @test confint(mdl1) ≈ confint(mdl2) @test aic(mdl1) ≈ aic(mdl2) @test predict(mdl1) ≈ predict(mdl2) end end +@testset "contrasts argument" begin + # DummyCoding (default) + m = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), + contrasts=Dict(:x=>DummyCoding())) + mcat = lm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5)))) + gm = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + contrasts=Dict(:x=>DummyCoding())) + gmcat = glm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5))), + Normal(), IdentityLink()) + @test coef(m) == coef(mcat) ≈ [11, 1, 2, 3, 4] + @test coef(gm) == coef(gmcat) ≈ [11, 1, 2, 3, 4] + + m = fit(LinearModel, @formula(y~x), (y=1:25, x=repeat(1:5, 5)), + contrasts=Dict(:x=>DummyCoding())) + mcat = fit(LinearModel, @formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5)))) + gm = fit(GeneralizedLinearModel, @formula(y~x), + (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + contrasts=Dict(:x=>DummyCoding())) + gmcat = fit(GeneralizedLinearModel, @formula(y~x), + (y=1:25, x=categorical(repeat(1:5, 5))), + Normal(), IdentityLink()) + @test coef(m) == coef(mcat) ≈ [11, 1, 2, 3, 4] + @test coef(gm) == coef(gmcat) ≈ [11, 1, 2, 3, 4] + + # EffectsCoding + m = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), + contrasts=Dict(:x=>EffectsCoding())) + gm = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + contrasts=Dict(:x=>EffectsCoding())) + @test coef(m) ≈ coef(gm) ≈ [13, -1, 0, 1, 2] + + m = fit(LinearModel, @formula(y~x), (y=1:25, x=repeat(1:5, 5)), + contrasts=Dict(:x=>EffectsCoding())) + gm = fit(GeneralizedLinearModel, @formula(y~x), + (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + contrasts=Dict(:x=>EffectsCoding())) + @test coef(m) ≈ coef(gm) ≈ [13, -1, 0, 1, 2] +end + + +@testset "dofit argument" begin + gm1 = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + dofit=true) + gm2 = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + dofit=false) + @test gm1.fit + @test !gm2.fit + fit!(gm2) + @test gm2.fit + @test coef(gm1) == coef(gm2) ≈ [10, 1] + + # Deprecated + gm1 = fit(GeneralizedLinearModel, @formula(y~x), + (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + dofit=true) + gm2 = fit(GeneralizedLinearModel, @formula(y~x), + (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + dofit=false) + @test gm1.fit + @test !gm2.fit + fit!(gm2) + @test gm2.fit + @test coef(gm1) == coef(gm2) ≈ [10, 1] +end + +@testset "formula accessor" begin + m = lm(rand(10, 2), rand(10)) + @test_throws ArgumentError formula(m) + + m = glm(rand(10, 2), rand(10), Normal(), IdentityLink()) + @test_throws ArgumentError formula(m) + + df = DataFrame(x=["a", "b", "c"], y=[1, 2, 3]) + m = lm(@formula(y ~ x), df) + @test formula(m)::FormulaTerm === m.formula + + df = DataFrame(x=["a", "b", "c"], y=[1, 2, 3]) + m = glm(@formula(y ~ x), df, Normal(), IdentityLink()) + @test formula(m)::FormulaTerm === m.formula +end + @testset "dropcollinear with GLMs" begin - data = DataFrame(x1=[4, 5, 9, 6, 5], x2=[5, 3, 6, 7, 1], + data = DataFrame(x1=[4, 5, 9, 6, 5], x2=[5, 3, 6, 7, 1], x3=[4.2, 4.6, 8.4, 6.2, 4.2], y=[14, 14, 24, 20, 11]) @testset "Check normal with identity link against equivalent linear model" begin @@ -1547,7 +1782,7 @@ end @test isnan(stderror(mdl1)[4]) @test dof(mdl1) == dof(mdl2) @test dof_residual(mdl1) == dof_residual(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ 
GLM.dispersion(mdl2,true) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) @test aic(mdl1) ≈ aic(mdl2) @@ -1562,7 +1797,7 @@ end @test stderror(mdl1) ≈ stderror(mdl2) @test dof(mdl1) == dof(mdl2) @test dof_residual(mdl1) == dof_residual(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2,true) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) @test aic(mdl1) ≈ aic(mdl2) @@ -1576,7 +1811,7 @@ end @test stderror(mdl)[1:3] ≈ [0.58371400875263, 0.10681694901238, 0.08531532203251] @test dof(mdl) == 4 @test dof_residual(mdl) == 2 - @test GLM.dispersion(mdl.model, true) ≈ 0.1341642228738996 + @test GLM.dispersion(mdl, true) ≈ 0.1341642228738996 @test deviance(mdl) ≈ 0.2683284457477991 @test loglikelihood(mdl) ≈ 0.2177608775670037 @test aic(mdl) ≈ 7.564478244866 @@ -1603,7 +1838,7 @@ end @test dof(mdl) == 3 @test dof_residual(mdl) == 98 @test aic(mdl) ≈ 141.68506068159 - @test GLM.dispersion(mdl.model, true) ≈ 1 + @test GLM.dispersion(mdl, true) ≈ 1 @test predict(mdl)[1:3] ≈ [0.4241893070433117, 0.3754516361306202, 0.6327877688720133] atol = 1.0E-6 @test confint(mdl)[1:2,1:2] ≈ [-0.5493329715011036 0.26350316142056085; -0.3582545657827583 0.64313795309765587] atol = 1.0E-1 @@ -1633,7 +1868,7 @@ end 3.9661379199401257, 5.079410103608552, 6.1944618141188625, 0.0, 7.930328728005131, 8.879994918604757, 2.986388408421915, 10.84972230524356, 11.844809275711485]) @test all(isnan, hcat(coeftable(m2p).cols[2:end]...)[7,:]) - + m2p_dep_pos = glm(Xmissingcell, ymissingcell, Normal()) @test_logs (:warn, "Positional argument `allowrankdeficient` is deprecated, use keyword " * "argument `dropcollinear` instead. Proceeding with positional argument value: true") fit(LinearModel, Xmissingcell, ymissingcell, true) @@ -1666,7 +1901,7 @@ end -0.820974608805111, -0.8581573302333557, -0.8838279927663583, 0.0, 0.667219148331652, 0.7087696966674913, 0.011287703617517712, 0.6816245514668273, 0.7250492032072612]) @test all(isnan, hcat(coeftable(m2p).cols[2:end]...)[7,:]) - + m2p_dep_pos = fit(GeneralizedLinearModel, Xmissingcell, ymissingcell, Gamma()) @test_logs (:warn, "Positional argument `allowrankdeficient` is deprecated, use keyword " * "argument `dropcollinear` instead. Proceeding with positional argument value: true") fit(LinearModel, Xmissingcell, ymissingcell, true) @@ -1677,14 +1912,13 @@ end end end - @testset "Floating point error in Binomial loglik" begin @test_throws InexactError GLM._safe_int(1.3) @test GLM._safe_int(1) === 1 # see issue 503 y, μ, wt, ϕ = 0.6376811594202898, 0.8492925285671102, 69.0, NaN # due to floating point: - # 1. y * wt == 43.99999999999999 + # 1. y * wt == 43.99999999999999 # 2. 44 / y == wt # 3. 
44 / wt == y @test GLM.loglik_obs(Binomial(), y, μ, wt, ϕ) ≈ GLM.logpdf(Binomial(Int(wt), μ), 44) From 613f5fe4ae35cebeb7fa9b3b4474d83bc521dc3d Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 8 Dec 2022 16:41:02 -0600 Subject: [PATCH 08/29] add code for pearson --- src/glmfit.jl | 7 +++++-- test/runtests.jl | 36 ++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/glmfit.jl b/src/glmfit.jl index f102c21c..b321b45e 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -821,8 +821,7 @@ function checky(y, d::Binomial) return nothing end -# need to add :pearson -const _RESIDUAL_TYPES = [:deviance, :response, :working] +const _RESIDUAL_TYPES = [:deviance, :pearson, :response, :working] """ residuals(model::GeneralizedLinearModel; type=:deviance) @@ -834,6 +833,8 @@ Supported values for `type` are: contribution to the deviance - `:response`: the difference between the observed and fitted values - `:working`: working residuals (used during the IRLS process) +- `:pearson`: Pearson residuals, i.e., response residuals scaled + by the standard error of the response """ function residuals(model::GeneralizedLinearModel; type=:deviance) type in _RESIDUAL_TYPES || @@ -846,6 +847,8 @@ function residuals(model::GeneralizedLinearModel; type=:deviance) # XXX I think this might be the same as # 2 * wrkresid, but I'm not 100% sure if that holds across families return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid) + elseif type == :pearson + return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.wrkwt) .* abs.(model.rr.wrkresid) elseif type == :working return model.rr.wrkresid else diff --git a/test/runtests.jl b/test/runtests.jl index c51d9b25..ab352182 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -306,10 +306,10 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], [-0.6712492, 0.9627236, -0.1696466, -0.2199851, -0.9555235, 1.049386, 0.8471537, -0.09167147, -0.9665637]; atol=1e-6) - # @test isapprox(residuals(gm1; type=:pearson), - # [-0.6546537, 1.004158, -0.1684304, -0.2182179, - # -0.9128709, 1.094797, 0.8728716, -0.09128709, -0.9263671]; - # atol=1e-6) + @test isapprox(residuals(gm1; type=:pearson), + [-0.6546537, 1.004158, -0.1684304, -0.2182179, + -0.9128709, 1.094797, 0.8728716, -0.09128709, -0.9263671]; + atol=1e-6) @test isapprox(residuals(gm1; type=:response), [-3, 3.666667, -0.6666667, -1, -3.333333, 4.333333, 4, -0.3333333, -3.666667]; @@ -359,13 +359,13 @@ admit.rank = categorical(admit.rank) @test isapprox(res[390:400], [-0.4027505, 0.5990641, 0.512806, 0.7779709, 0.5612748, 0.7465767, -0.48867, -0.1655043, -0.1810622, -0.4636674, -0.3007306]; atol=1e-6) - # res = residuals(gm2; type=:pearson) - # @test isapprox(res[1:10], - # [-0.4567757, 1.556473, 0.5952011, 2.146128, -0.3663905, - # 1.304961, 1.176959, -0.5264452, 1.995417, -1.036398]; atol=1e-6) - # @test isapprox(res[390:400], - # [-0.8211834, 1.22236, 1.025949, 1.871874, 1.131075, - # 1.716382, -0.977591, -0.4453409, -0.4702063, -0.9297929]; atol=1e-6) + res = residuals(gm2; type=:pearson) + @test isapprox(res[1:10], + [-0.4567757, 1.556473, 0.5952011, 2.146128, -0.3663905, + 1.304961, 1.176959, -0.5264452, 1.995417, -1.036398]; atol=1e-6) + @test isapprox(res[390:399], + [-0.8211834, 1.22236, 1.025949, 1.871874, 1.131075, + 1.716382, -0.977591, -0.4453409, -0.4702063, -0.9297929]; atol=1e-6) # less extensive test here because it's just grabbing the IRLS values, which are already # extensively tested @test isapprox(residuals(gm2; 
type=:working)[1:10], @@ -538,10 +538,10 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]), [-0.04008349, 0.08641118, 0.04900896, 0.02904992, -0.03846595, 0.001112578, -0.02869586, -0.03755713, -0.0263724]; atol=1e-6) - # @test isapprox(residuals(gm8; type=:pearson), - # [-0.03954973, 0.08891786, 0.04981284, 0.0293319, - # -0.03797433, 0.001112991, -0.02842204, -0.03708843, -0.02614107]; - # atol=1e-6) + @test isapprox(residuals(gm8; type=:pearson), + [-0.03954973, 0.08891786, 0.04981284, 0.0293319, + -0.03797433, 0.001112991, -0.02842204, -0.03708843, -0.02614107]; + atol=1e-5) end @testset "InverseGaussian" begin @@ -572,9 +572,9 @@ end [-0.01230767, 0.04799467, 0.03430759, 0.02309913, -0.001715577, -0.002827721, -0.02123177, -0.03179512, -0.0359572]; atol=1e-6) - # @test isapprox(residuals(gm8a; type=:pearson), - # [-0.01145543, 0.05608432, 0.03793027, 0.02462693, -0.001707913, - # -0.00280766, -0.02017246, -0.02950971, -0.03310111]; atol=1e-6) + @test isapprox(residuals(gm8a; type=:pearson), + [-0.01145543, 0.05608432, 0.03793027, 0.02462693, -0.001707913, + -0.00280766, -0.02017246, -0.02950971, -0.03310111]; atol=1e-6) end @testset "Gamma LogLink" begin From dd491b5c863c4a6fb4685aaa8d10e3b736e64f57 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 8 Dec 2022 17:08:07 -0600 Subject: [PATCH 09/29] whitespace --- test/runtests.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d85c57f4..90527334 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1297,7 +1297,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 3 0.1283 0.9603 + [1] 3 0.1283 0.9603 [2] 2 -1 3.2292 3.1008 0.0000 -0.9603 241.6234 <1e-07 ─────────────────────────────────────────────────────────────────""" else @@ -1306,7 +1306,7 @@ end ──────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ──────────────────────────────────────────────────────────────── - [1] 3 0.1283 0.9603 + [1] 3 0.1283 0.9603 [2] 2 -1 3.2292 3.1008 0.0000 -0.9603 241.6234 <1e-7 ────────────────────────────────────────────────────────────────""" end @@ -1320,7 +1320,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 2 3.2292 0.0000 + [1] 2 3.2292 0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 ─────────────────────────────────────────────────────────────────""" else @@ -1329,7 +1329,7 @@ end ──────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ──────────────────────────────────────────────────────────────── - [1] 2 3.2292 0.0000 + [1] 2 3.2292 0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 ────────────────────────────────────────────────────────────────""" end @@ -1345,7 +1345,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 2 3.2292 0.0000 + [1] 2 3.2292 0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 [3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 ─────────────────────────────────────────────────────────────────""" @@ -1355,7 +1355,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) 
───────────────────────────────────────────────────────────────── - [1] 2 3.2292 0.0000 + [1] 2 3.2292 0.0000 [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 [3] 5 2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 ─────────────────────────────────────────────────────────────────""" @@ -1371,7 +1371,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 5 0.1017 0.9685 + [1] 5 0.1017 0.9685 [2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950 [3] 2 -1 3.2292 3.1008 0.0000 -0.9603 241.6234 <1e-07 ─────────────────────────────────────────────────────────────────""" @@ -1381,7 +1381,7 @@ end ───────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) ───────────────────────────────────────────────────────────────── - [1] 5 0.1017 0.9685 + [1] 5 0.1017 0.9685 [2] 3 -2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950 [3] 2 -1 3.2292 3.1008 0.0000 -0.9603 241.6234 <1e-7 ─────────────────────────────────────────────────────────────────""" From 5be6314e6a71148e6dd62aac35b0ed380088c5f1 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 8 Dec 2022 17:14:30 -0600 Subject: [PATCH 10/29] whitespace --- src/glmfit.jl | 35 ----------------------------------- test/runtests.jl | 28 ++++++++++++---------------- 2 files changed, 12 insertions(+), 51 deletions(-) diff --git a/src/glmfit.jl b/src/glmfit.jl index b321b45e..490cc33b 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -820,38 +820,3 @@ function checky(y, d::Binomial) end return nothing end - -const _RESIDUAL_TYPES = [:deviance, :pearson, :response, :working] - -""" - residuals(model::GeneralizedLinearModel; type=:deviance) - -Return the residuals of a GLM. - -Supported values for `type` are: -- `:deviance` (the default): the signed square root of the element-wise - contribution to the deviance -- `:response`: the difference between the observed and fitted values -- `:working`: working residuals (used during the IRLS process) -- `:pearson`: Pearson residuals, i.e., response residuals scaled - by the standard error of the response -""" -function residuals(model::GeneralizedLinearModel; type=:deviance) - type in _RESIDUAL_TYPES || - throw(ArgumentError("Unsupported type `$(type)``; supported types are" * - "$(_RESIDUAL_TYPES)")) - # TODO: add in optimized method for normal with identity link - if type == :response - return response(model) - fitted(model) - elseif type == :deviance - # XXX I think this might be the same as - # 2 * wrkresid, but I'm not 100% sure if that holds across families - return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid) - elseif type == :pearson - return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.wrkwt) .* abs.(model.rr.wrkresid) - elseif type == :working - return model.rr.wrkresid - else - error("An error has occurred. 
Please file an issue on GitHub.") - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 90527334..e8f57d05 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -54,14 +54,14 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y end @testset "Linear Model Cook's Distance" begin - st_df = DataFrame( + st_df = DataFrame( Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], - XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], + XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], XC=[3., 13., 23., 39.8, 34., 31.], # values from SAS proc reg - CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], - CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], + CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], + CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], ) @@ -69,7 +69,7 @@ end t_lm_base = lm(@formula(Y ~ XA), st_df) @test isapprox(st_df.CooksD_base, cooksdistance(t_lm_base)) - # linear regression, no intercept + # linear regression, no intercept t_lm_noint = lm(@formula(Y ~ XA +0), st_df) @test isapprox(st_df.CooksD_noint, cooksdistance(t_lm_noint)) @@ -85,7 +85,7 @@ end end -@testset "linear model with weights" begin +@testset "linear model with weights" begin df = dataset("quantreg", "engel") N = nrow(df) df.weights = repeat(1:5, Int(N/5)) @@ -97,17 +97,13 @@ end @test isapprox(stderror(lm_model), [9.382302620120193, 0.00816741377772968]) @test isapprox(r2(lm_model), 0.8330258148644486) @test isapprox(adjr2(lm_model), 0.832788298242634) - @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; + @test isapprox(vcov(lm_model), [88.02760245551447 -0.06772589439264813; -0.06772589439264813 6.670664781664879e-5]) @test isapprox(first(predict(lm_model)), 357.57694841780994) @test isapprox(loglikelihood(lm_model), -4353.946729075838) @test isapprox(loglikelihood(glm_model), -4353.946729075838) @test isapprox(nullloglikelihood(lm_model), -4984.892139711452) - @test isapprox(mean(residuals(lm_model)), -5.412966629787718) - - # this should be elementwise true (which is a stronger condition than - # vectors being approximately equal) the so we test elementwise - @test all(residuals(glm_model; type=:response) .≈ residuals(lm_model)) + @test isapprox(mean(residuals(lm_model)), -5.412966629787718) end @testset "rankdeficient" begin @@ -792,8 +788,8 @@ end @test aic(gm22) ≈ 1112.7422553284146 @test aicc(gm22) ≈ 1113.7933502189255 @test bic(gm22) ≈ 1136.6111083020812 - @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, - -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, + @test coef(gm22)[1:7] ≈ [2.8978546663153897, -0.5701067649409168, 0.08040181505082235, + -0.4497584898742737, 0.08622664933901254, 0.3558996662512287, 0.29016080736927813] @test stderror(gm22) ≈ [0.22754287093719366, 0.15274755092180423, 0.15928431669166637, 0.23853372776980591, 0.2354231414867577, 0.24750780320597515, @@ -1769,7 +1765,7 @@ end end @testset "dropcollinear with GLMs" begin - data = DataFrame(x1=[4, 5, 9, 6, 5], x2=[5, 3, 6, 7, 1], + data = DataFrame(x1=[4, 5, 9, 6, 5], x2=[5, 3, 6, 7, 1], x3=[4.2, 4.6, 8.4, 6.2, 4.2], y=[14, 14, 24, 20, 11]) @testset "Check normal with identity link against equivalent linear model" begin @@ -1918,7 +1914,7 @@ end # see 
issue 503 y, μ, wt, ϕ = 0.6376811594202898, 0.8492925285671102, 69.0, NaN # due to floating point: - # 1. y * wt == 43.99999999999999 + # 1. y * wt == 43.99999999999999 # 2. 44 / y == wt # 3. 44 / wt == y @test GLM.loglik_obs(Binomial(), y, μ, wt, ϕ) ≈ GLM.logpdf(Binomial(Int(wt), μ), 44) From b58c3bd39cca30675649b2c304edb840fd749cfd Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 8 Dec 2022 17:15:19 -0600 Subject: [PATCH 11/29] whitespace --- src/glmtools.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glmtools.jl b/src/glmtools.jl index 1a7775e9..2c7c2ea0 100644 --- a/src/glmtools.jl +++ b/src/glmtools.jl @@ -527,7 +527,7 @@ function loglik_obs end loglik_obs(::Bernoulli, y, μ, wt, ϕ) = wt*logpdf(Bernoulli(μ), y) loglik_obs(::Binomial, y, μ, wt, ϕ) = logpdf(Binomial(Int(wt), μ), _safe_int(y*wt)) loglik_obs(::Gamma, y, μ, wt, ϕ) = wt*logpdf(Gamma(inv(ϕ), μ*ϕ), y) -# In Distributions.jl, a Geometric distribution characterizes the number of failures before +# In Distributions.jl, a Geometric distribution characterizes the number of failures before # the first success in a sequence of independent Bernoulli trials with success rate p. # The mean of Geometric distribution is (1 - p) / p. # Hence, p = 1 / (1 + μ). From 3cb863e70903af5eb3ab8016b8b4383ed49b903b Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 8 Dec 2022 17:17:30 -0600 Subject: [PATCH 12/29] again --- src/glmfit.jl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/glmfit.jl b/src/glmfit.jl index 490cc33b..b321b45e 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -820,3 +820,38 @@ function checky(y, d::Binomial) end return nothing end + +const _RESIDUAL_TYPES = [:deviance, :pearson, :response, :working] + +""" + residuals(model::GeneralizedLinearModel; type=:deviance) + +Return the residuals of a GLM. + +Supported values for `type` are: +- `:deviance` (the default): the signed square root of the element-wise + contribution to the deviance +- `:response`: the difference between the observed and fitted values +- `:working`: working residuals (used during the IRLS process) +- `:pearson`: Pearson residuals, i.e., response residuals scaled + by the standard error of the response +""" +function residuals(model::GeneralizedLinearModel; type=:deviance) + type in _RESIDUAL_TYPES || + throw(ArgumentError("Unsupported type `$(type)``; supported types are" * + "$(_RESIDUAL_TYPES)")) + # TODO: add in optimized method for normal with identity link + if type == :response + return response(model) - fitted(model) + elseif type == :deviance + # XXX I think this might be the same as + # 2 * wrkresid, but I'm not 100% sure if that holds across families + return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid) + elseif type == :pearson + return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.wrkwt) .* abs.(model.rr.wrkresid) + elseif type == :working + return model.rr.wrkresid + else + error("An error has occurred. 
Please file an issue on GitHub.") + end +end From cfa33e0520677ca614a0772d800720f70eddb8e2 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 8 Dec 2022 18:04:31 -0600 Subject: [PATCH 13/29] Geometric and negative binomial --- test/runtests.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index e8f57d05..6bf70a43 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -809,6 +809,27 @@ end @test aicc(gm23) ≈ aicc(gm24) @test bic(gm23) ≈ bic(gm24) @test predict(gm23) ≈ predict(gm24) + + @test isapprox(residuals(gm23; type=:deviance)[1:10], + [-1.691255, -0.706297, -0.519149, -0.961134, -0.961134, + -0.234178, 0.17945, 0.279821, -0.803948, -0.803948]; + atol=1e-6) + @test isapprox(residuals(gm23; type=:pearson)[1:10], + [-0.902477, -0.550709, -0.433453, -0.680948, -0.680948, + -0.216258, 0.190345, 0.306518, -0.604398, -0.604398]; + atol=1e-5) + @test isapprox(residuals(gm23; type=:response)[1:10], + [-23.089885, -14.089885, -11.089885, -11.723055, -11.723055, + -3.723055, 3.276945, 5.276945, -9.919, -9.919]; + atol=1e-5) + @test isapprox(residuals(gm23; type=:working)[1:10], + [0.03668, 0.022383, 0.017617, 0.041919, 0.041919, + 0.013313, -0.011718, -0.018869, 0.039141, 0.039141]; + atol=1e-6) + + for rtype in GLM._RESIDUAL_TYPES + @test isapprox(residuals.([gm23, gm24]; type=rtype)...; atol=1e-6) + end end @testset "GLM with no intercept" begin From ee87fc780967af92bacd72030dd802f1574851d8 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 8 Dec 2022 18:13:02 -0600 Subject: [PATCH 14/29] residuals for normal --- test/runtests.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 6bf70a43..193e36cd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -104,6 +104,10 @@ end @test isapprox(loglikelihood(glm_model), -4353.946729075838) @test isapprox(nullloglikelihood(lm_model), -4984.892139711452) @test isapprox(mean(residuals(lm_model)), -5.412966629787718) + + # this should be elementwise true (which is a stronger condition than + # vectors being approximately equal) the so we test elementwise + @test all(residuals(glm_model) .≈ residuals(lm_model)) end @testset "rankdeficient" begin From dfa5f15d487804253224debb19bf7244035882ec Mon Sep 17 00:00:00 2001 From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com> Date: Mon, 30 Jan 2023 01:44:46 +0530 Subject: [PATCH 15/29] Update test/runtests.jl --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 193e36cd..4d274de7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -107,7 +107,7 @@ end # this should be elementwise true (which is a stronger condition than # vectors being approximately equal) the so we test elementwise - @test all(residuals(glm_model) .≈ residuals(lm_model)) + @test all(residuals(glm_model, type = :working) .≈ residuals(lm_model)) end @testset "rankdeficient" begin From 8eb70a8a4ab1f698d1c2086deff5ce2d7f5e51cf Mon Sep 17 00:00:00 2001 From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com> Date: Thu, 4 May 2023 17:51:40 +0530 Subject: [PATCH 16/29] Update src/glmfit.jl Co-authored-by: Alex Arslan --- src/glmfit.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/glmfit.jl b/src/glmfit.jl index b321b45e..97a99fc8 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -847,9 +847,9 @@ function residuals(model::GeneralizedLinearModel; type=:deviance) # XXX I think this might be the same as 
From 8eb70a8a4ab1f698d1c2086deff5ce2d7f5e51cf Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:51:40 +0530
Subject: [PATCH 16/29] Update src/glmfit.jl

Co-authored-by: Alex Arslan
---
 src/glmfit.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glmfit.jl b/src/glmfit.jl
index b321b45e..97a99fc8 100644
--- a/src/glmfit.jl
+++ b/src/glmfit.jl
@@ -847,9 +847,9 @@ function residuals(model::GeneralizedLinearModel; type=:deviance)
         # XXX I think this might be the same as
         # 2 * wrkresid, but I'm not 100% sure if that holds across families
         return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid)
-    elseif type == :pearson
-        return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.wrkwt) .* abs.(model.rr.wrkresid)
-    elseif type == :working
+    elseif type === :pearson
+        return copysign.(model.rr.wrkresid, response(model) .- fitted(model)) .* sqrt.(model.rr.wrkwt)
+    elseif type === :working
         return model.rr.wrkresid
     else
         error("An error has occurred. Please file an issue on GitHub.")

From c8b69b6258f46b83508eeeb5d53fcf26a9ee9380 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:52:14 +0530
Subject: [PATCH 17/29] Update src/glmfit.jl

Co-authored-by: Alex Arslan
---
 src/glmfit.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glmfit.jl b/src/glmfit.jl
index 97a99fc8..76035ba0 100644
--- a/src/glmfit.jl
+++ b/src/glmfit.jl
@@ -841,7 +841,7 @@ function residuals(model::GeneralizedLinearModel; type=:deviance)
         throw(ArgumentError("Unsupported type `$(type)`; supported types are " *
                             "$(_RESIDUAL_TYPES)"))
     # TODO: add an optimized method for normal with identity link
-    if type == :response
+    if type === :response
         return response(model) - fitted(model)
     elseif type == :deviance

From ae0b68000aa5af8386fe5ed55c03a8f2f34c7761 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:52:24 +0530
Subject: [PATCH 18/29] Update src/glmfit.jl

Co-authored-by: Alex Arslan
---
 src/glmfit.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glmfit.jl b/src/glmfit.jl
index 76035ba0..f23deeaf 100644
--- a/src/glmfit.jl
+++ b/src/glmfit.jl
@@ -843,7 +843,7 @@ function residuals(model::GeneralizedLinearModel; type=:deviance)
     if type === :response
         return response(model) - fitted(model)
-    elseif type == :deviance
+    elseif type === :deviance
         # XXX I think this might be the same as
         # 2 * wrkresid, but I'm not 100% sure if that holds across families
         return sign.(response(model) .- fitted(model)) .* sqrt.(model.rr.devresid)

From 88345d68c49684636da222a6d20e0033216fd16d Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:52:40 +0530
Subject: [PATCH 19/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 6d5cd9f3..d582a779 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -304,7 +304,7 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
     @test isapprox(residuals(gm1; type=:deviance),
                    [-0.6712492, 0.9627236, -0.1696466, -0.2199851, -0.9555235,
                     1.049386, 0.8471537, -0.09167147, -0.9665637];
-                   atol=1e-6)
+                   atol=1e-6)
     @test isapprox(residuals(gm1; type=:pearson),
                    [-0.6546537, 1.004158, -0.1684304, -0.2182179, -0.9128709,
                     1.094797, 0.8728716, -0.09128709, -0.9263671];
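Two idioms in the preceding patches deserve a note. `copysign.(r, d)` fuses what `sign.(d) .* abs.(r)` spells as two broadcasts into one; the spellings agree whenever `d` is nonzero, and in `residuals` the disagreement at zero is immaterial because the working residual is itself zero whenever the response equals the fitted value. And `===` is the idiomatic comparison against `Symbol` constants, since Symbols are interned and compared by identity. A standalone sketch with made-up values:

    r = [0.5, -0.25, 0.75]
    d = [1.0, -2.0, 0.0]
    sign.(d) .* abs.(r)    # [0.5, -0.25, 0.0]: the zero in d zeroes the product
    copysign.(r, d)        # [0.5, -0.25, 0.75]: magnitude kept; +0.0 counts as positive
    :pearson === :pearson  # true: Symbols compare by identity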
From f55713a2b953844f064e1e32041068aee3ea0207 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:53:04 +0530
Subject: [PATCH 20/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index d582a779..debaebda 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -816,7 +816,7 @@ end
     @test isapprox(residuals(gm23; type=:deviance)[1:10],
                    [-1.691255, -0.706297, -0.519149, -0.961134, -0.961134,
                     -0.234178, 0.17945, 0.279821, -0.803948, -0.803948];
-                   atol=1e-6)
+                   atol=1e-6)
     @test isapprox(residuals(gm23; type=:pearson)[1:10],
                    [-0.902477, -0.550709, -0.433453, -0.680948, -0.680948,
                     -0.216258, 0.190345, 0.306518, -0.604398, -0.604398];

From 64eda0be1f1d9a91709c1badbd6b8f804d7f3240 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:53:32 +0530
Subject: [PATCH 21/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index debaebda..d4f98d05 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -823,7 +823,7 @@ end
                    atol=1e-5)
     @test isapprox(residuals(gm23; type=:response)[1:10],
                    [-23.089885, -14.089885, -11.089885, -11.723055, -11.723055,
-                    -3.723055, 3.276945, 5.276945, -9.919, -9.919];
+                    -3.723055, 3.276945, 5.276945, -9.919, -9.919];
                    atol=1e-5)
     @test isapprox(residuals(gm23; type=:working)[1:10],
                    [0.03668, 0.022383, 0.017617, 0.041919, 0.041919,

From 6bd8980a9022928eb8a70276fd1397e818127e21 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:54:33 +0530
Subject: [PATCH 22/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index d4f98d05..7c8b2b0c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -826,9 +826,9 @@ end
                     -3.723055, 3.276945, 5.276945, -9.919, -9.919];
                    atol=1e-5)
     @test isapprox(residuals(gm23; type=:working)[1:10],
-                   [0.03668, 0.022383, 0.017617, 0.041919, 0.041919,
-                    0.013313, -0.011718, -0.018869, 0.039141, 0.039141];
-                   atol=1e-6)
+                   [0.03668, 0.022383, 0.017617, 0.041919, 0.041919,
+                    0.013313, -0.011718, -0.018869, 0.039141, 0.039141];
+                   atol=1e-6)

     for rtype in GLM._RESIDUAL_TYPES
         @test isapprox(residuals.([gm23, gm24]; type=rtype)...; atol=1e-6)

From 3445319cef3c9b18d40b6b0547766ef79857c439 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 17:55:47 +0530
Subject: [PATCH 23/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 7c8b2b0c..c5b382e7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -314,9 +314,9 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
                     4.333333, 4, -0.3333333, -3.666667];
                    atol=1e-6)
     @test isapprox(residuals(gm1; type=:working),
-                   [-0.1428571, 0.275, -0.04255319, -0.04761905,
-                    -0.25, 0.2765957, 0.1904762, -0.025, -0.2340426];
-                   atol=1e-6)
+                   [-0.1428571, 0.275, -0.04255319, -0.04761905,
+                    -0.25, 0.2765957, 0.1904762, -0.025, -0.2340426];
+                   atol=1e-6)

     # Deprecated methods
     @test gm1.model === gm1
From fe6800b4eec82295146431e8774b2e49c7b0efac Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 18:03:01 +0530
Subject: [PATCH 24/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index c5b382e7..baefb0ae 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -354,7 +354,7 @@ admit.rank = categorical(admit.rank)
     # values from R
     @test isapprox(res[1:10],
                    [-0.1726265, 0.707825, 0.2615918, 0.8216154, -0.1183539,
-                    0.6300301, 0.5807538, -0.2170033, 0.7992648, -0.5178682], atol=1e-6)
+                    0.6300301, 0.5807538, -0.2170033, 0.7992648, -0.5178682]; atol=1e-6)
     @test isapprox(res[390:400],
                    [-0.4027505, 0.5990641, 0.512806, 0.7779709, 0.5612748,
                     0.7465767, -0.48867, -0.1655043, -0.1810622, -0.4636674, -0.3007306]; atol=1e-6)

From 282e3211d93919c4d831abe75d516269740ea56b Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 18:18:34 +0530
Subject: [PATCH 25/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index baefb0ae..0bf28401 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -540,7 +540,7 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]),
     @test isapprox(residuals(gm8; type=:pearson),
                    [-0.03954973, 0.08891786, 0.04981284, 0.0293319, -0.03797433,
                     0.001112991, -0.02842204, -0.03708843, -0.02614107];
-                   atol=1e-5)
+                   atol=1e-5)
 end

 @testset "InverseGaussian" begin

From 904e1580ccc7b2cec8cc96d3099ccd547b943d58 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 18:19:02 +0530
Subject: [PATCH 26/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 0bf28401..3f66febf 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -532,7 +532,7 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]),
     @test isapprox(residuals(gm8; type=:working),
                    [0.0003219114, -0.001669384, -0.001245099, -0.0008626359, 0.001353047,
                     -4.456918e-05, 0.001314963, 0.001879625, 0.001414318];
-                   atol=1e-6)
+                   atol=1e-6)
     @test isapprox(residuals(gm8; type=:deviance),
                    [-0.04008349, 0.08641118, 0.04900896, 0.02904992, -0.03846595,
                     0.001112578, -0.02869586, -0.03755713, -0.0263724];

From 1a1cbeb37cd7fad540ea3d823c6b2f5832b91937 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 18:22:40 +0530
Subject: [PATCH 27/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 3f66febf..e4240bc6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -536,7 +536,7 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]),
     @test isapprox(residuals(gm8; type=:deviance),
                    [-0.04008349, 0.08641118, 0.04900896, 0.02904992, -0.03846595,
                     0.001112578, -0.02869586, -0.03755713, -0.0263724];
-                   atol=1e-6)
+                   atol=1e-6)
     @test isapprox(residuals(gm8; type=:pearson),
                    [-0.03954973, 0.08891786, 0.04981284, 0.0293319, -0.03797433,
                     0.001112991, -0.02842204, -0.03708843, -0.02614107];
From d90bab58082202ab77a1c4c80b2933ad6595af88 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 18:23:08 +0530
Subject: [PATCH 28/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index e4240bc6..526c477f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -566,7 +566,7 @@ end
     @test isapprox(residuals(gm8a; type=:working),
                    [1.441199e-05, -0.0004052051, -0.0003766424, -0.0002882584, 2.402242e-05,
                     4.397323e-05, 0.0003595653, 0.0005697418, 0.0006762887];
-                   atol=1e-6)
+                   atol=1e-6)
     @test isapprox(residuals(gm8a; type=:deviance),
                    [-0.01230767, 0.04799467, 0.03430759, 0.02309913, -0.001715577,
                     -0.002827721, -0.02123177, -0.03179512, -0.0359572];

From e9865beeccb868a155045c98d57486667c2a1e03 Mon Sep 17 00:00:00 2001
From: Mousum Dutta <44145580+mousum-github@users.noreply.github.com>
Date: Thu, 4 May 2023 18:23:45 +0530
Subject: [PATCH 29/29] Update test/runtests.jl

Co-authored-by: Alex Arslan
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 526c477f..1a4571b1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -569,7 +569,7 @@ end
                    atol=1e-6)
     @test isapprox(residuals(gm8a; type=:deviance),
                    [-0.01230767, 0.04799467, 0.03430759, 0.02309913, -0.001715577,
-                    -0.002827721, -0.02123177, -0.03179512, -0.0359572];
+                    -0.002827721, -0.02123177, -0.03179512, -0.0359572];
                    atol=1e-6)
     @test isapprox(residuals(gm8a; type=:pearson),
                    [-0.01145543, 0.05608432, 0.03793027, 0.02462693, -0.001707913,