Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Empirical CDF enhancements #590

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ const RealFP = Union{Float32, Float64}
const DepBool = Union{Bool, Nothing}

function depcheck(fname::Symbol, b::DepBool)
if b == nothing
if b === nothing
msg = "$fname will default to corrected=true in the future. Use corrected=false for previous behaviour."
Base.depwarn(msg, fname)
false
return false
else
b
return b
end
end
6 changes: 6 additions & 0 deletions src/deprecates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,9 @@ end
### Deprecated September 2019
@deprecate sum(A::AbstractArray, w::AbstractWeights, dims::Int) sum(A, w, dims=dims)
@deprecate values(wv::AbstractWeights) convert(Vector, wv)

### Deprecate August 2020 (v0.33)
function (ecdf::ECDF)(v::RealArray)
depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", "(ECDF::ecdf)(v::RealArray)")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More common, and possibly faster:

Suggested change
depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", "(ECDF::ecdf)(v::RealArray)")
depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", :ecdf)

return ecdf.(v)
end
162 changes: 118 additions & 44 deletions src/empirical.jl
Original file line number Diff line number Diff line change
@@ -1,68 +1,142 @@
# Empirical estimation of CDF and PDF

## Empirical CDF
"""
Empirical Cumulative Distribution Function (ECDF).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Empirical Cumulative Distribution Function (ECDF).
ECDF{T <: Real, W <: Real, I}
Empirical Cumulative Distribution Function (ECDF).


Represents ECDF constructed from random variable samples by [`ecdf`](@ref).

# Type parameters
* `T`: the type of random variable values
* `W`: the type of sample weights and CDF values
* `I`: boolean indicating whether to interpolate
the CDF between neighboring samples (continuous distribution)
or not (discrete distribution)
"""
struct ECDF{T <: Real, W <: Real, I}
# sorted random variable values and associated probabilities.
# The tuple provides:
# - `x[i]` (value of random variable)
# - `ECDF(x[i])`
# - `1/(x[i+1] - x[i])`
# - `ECDF(x[i+1]) - ECDF(x[i])` (the weight of `x[i+1]`)
sorted_values::Vector{Tuple{T, W, W, W}}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it less efficient to store values as a vector of tuples instead of as several vectors? In terms of memory use AFAIK alignment may require padding between entries.

end

# type of value weights and CDF values
weighttype(::Type{<:ECDF{<:Any, W}}) where W = W
weighttype(ecdf::ECDF) = weighttype(typeof(ecdf))

struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}}
sorted_values::T
weights::W
# whether to interpolate between the neighboring ECDF values
isinterpolating(::Type{<:ECDF{<:Any, <:Any, I}}) where I = I
isinterpolating(ecdf::ECDF) = isinterpolating(typeof(ecdf))

function Base.show(io::IO, e::ECDF)
print(io, typeof(e))
print(io, "(")
print(io, length(e.sorted_values))
println(io, " values)")
Comment on lines +35 to +37
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
print(io, "(")
print(io, length(e.sorted_values))
println(io, " values)")
print(io, "(")
print(io, length(e.sorted_values))
println(io, " values)")

To avoid making it look as if values was and argument to the constructor:

Suggested change
print(io, "(")
print(io, length(e.sorted_values))
println(io, " values)")
print(io, " with ")
print(io, length(e.sorted_values))
println(io, " values")

Also maybe "points" is better than "values"? I.e. it's not the number of values in the input but the number of unique values, right?

end

# calculate ECDF at given point
function (ecdf::ECDF)(x::Real)
isnan(x) && return NaN
n = searchsortedlast(ecdf.sorted_values, x)
evenweights = isempty(ecdf.weights)
weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights)
partialsum = evenweights ? n : sum(view(ecdf.weights, 1:n))
partialsum / weightsum
pos = searchsortedlast(ecdf.sorted_values, x, by=first)
(pos == 0) && return zero(weighttype(ecdf))
@inbounds val, cdf_val, val_invdelta, cdf_delta = ecdf.sorted_values[pos]
if !isinterpolating(ecdf) || (val == x) || (pos == length(ecdf.sorted_values))
return cdf_val
else
return muladd(cdf_delta, min((x - val) * val_invdelta, one(weighttype(ecdf))), cdf_val)
end
end

function (ecdf::ECDF)(v::RealVector)
evenweights = isempty(ecdf.weights)
weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights)
ord = sortperm(v)
m = length(v)
r = similar(ecdf.sorted_values, m)
r0 = zero(weightsum)
i = 1
for (j, x) in enumerate(ecdf.sorted_values)
while i <= m && x > v[ord[i]]
r[ord[i]] = r0
i += 1
end
r0 += evenweights ? 1 : ecdf.weights[j]
if i > m
break
end
end
while i <= m
r[ord[i]] = weightsum
i += 1
Comment on lines -19 to -39
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So all this code was redundant with calling ecdf on each point in a loop? :-/

# broadcasts ecdf() over an array
# caches the last calculated value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by "caches"?

function Base.Broadcast.broadcasted(ecdf::ECDF, v::AbstractArray)
res = similar(v, weighttype(ecdf))
@inbounds for i in eachindex(v)
res[i] = i == 1 || v[i] != v[i-1] ? ecdf(v[i]) : res[i-1]
end
return r / weightsum
return res
end

"""
ecdf(X; weights::AbstractWeights)
ecdf(X::RealVector; weights::AbstractWeights=nothing, interpolate::Bool=false) -> ECDF

Construct an *empirical cumulative distribution function* (ECDF) based on a vector of samples (`X`).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Construct an *empirical cumulative distribution function* (ECDF) based on a vector of samples (`X`).
Construct an *empirical cumulative distribution function* (ECDF) based on vector of samples `X`.

Returns a callable [`ECDF`](@ref) object.

Return an empirical cumulative distribution function (ECDF) based on a vector of samples
given in `X`. Optionally providing `weights` returns a weighted ECDF.
# Example

Note: this function that returns a callable composite type, which can then be applied to
evaluate CDF values on other samples.
Comment on lines -50 to -51
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to keep a mention about the fact that one needs to use broadcast here (even if that's also in examples), as it's not totally obvious.

```jldoctest
julia> using StatsBase

`extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which
function is inside the interval ``(0,1)``; the function is defined for the whole real line.
julia> X = 1:15
1:15

julia> xcdf = ecdf(X)
ECDF{Int64,Float64,false}(15 values)

julia> xcdf(9)
0.6
```

## Keyword arguments
* `weights`: optional weights for each `X` sample
* `interpolate`: when enabled, `ECDF(y)` will linearly interpolate the cumulative density between
the neighboring `X` samples (``x_i \\leq y < x_{i+1}``),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
the neighboring `X` samples (``x_i \\leq y < x_{i+1}``),
the neighboring values in `X` (``x_i \\leq y < x_{i+1}`` with ``x`` sorted values in `X`),

otherwise (by default) `ECDF(x[i])` is returned

## Notes
The returned function is defined for the whole real line.
Use [`extrema`](@ref), [`minimum`](@ref), and [`maximum`](@ref) to obtain the range
over which the ECDF is inside the interval ``(0, 1)``.
"""
function ecdf(X::RealVector; weights::AbstractVector{<:Real}=Weights(Float64[]))
function ecdf(X::RealVector;
weights::Union{Nothing, RealVector}=nothing,
interpolate::Bool=false)
any(isnan, X) && throw(ArgumentError("ecdf can not include NaN values"))
isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," *
"got $(length(X)) and $(length(weights))"))
evenweights = isnothing(weights) || isempty(weights)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inference works better with ===:

Suggested change
evenweights = isnothing(weights) || isempty(weights)
evenweights = weights === nothing || isempty(weights)

evenweights || (length(X) == length(weights)) ||
throw(ArgumentError("data and weight vectors must be the same size, " *
"got $(length(X)) and $(length(weights))"))
T = eltype(X)
W0 = evenweights ? Int : eltype(weights)
W = isnothing(weights) ? Float64 : eltype(one(W0)/sum(weights))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Store sum(weights) to avoid calling it twice?


wsum = evenweights ? length(X) : sum(weights)
ord = sortperm(X)
ECDF(X[ord], isempty(weights) ? weights : Weights(weights[ord]))

sorted_vals = sizehint!(Vector{Tuple{T, W, W, W}}(), length(X))
isempty(X) && return ECDF{T, W, interpolate}(sorted_vals)

valprev = val = X[first(ord)]
wsumprev = zero(W0)
valw = zero(W0)

push_valprev!() = push!(sorted_vals, (valprev, min(wsumprev/wsum, one(W)),
inv(val - valprev), valw/wsum))

@inbounds for i in ord
valnew = X[i]
if (val != valnew) || (i == last(ord))
(wsumprev > 0) && push_valprev!()
Comment on lines +117 to +123
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't this anonymous function trigger boxing of these variables, making the loop quite slow?

valprev = val
val = valnew
wsumprev += valw
valw = zero(W0)
end
valw += evenweights ? one(W0) : weights[i]
end
#@assert valw + wsumprev == wsum # may fail due to fp-arithmetic
(wsumprev > 0) && push_valprev!()
# last value
push!(sorted_vals, (val, one(W), zero(W), zero(W)))
return ECDF{T,W,interpolate}(sorted_vals)
end

minimum(ecdf::ECDF) = first(ecdf.sorted_values)
minimum(ecdf::ECDF) = first(ecdf.sorted_values)[1]

maximum(ecdf::ECDF) = last(ecdf.sorted_values)
maximum(ecdf::ECDF) = last(ecdf.sorted_values)[1]

extrema(ecdf::ECDF) = (minimum(ecdf), maximum(ecdf))
4 changes: 2 additions & 2 deletions src/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs))
s = w.sum

if corrected
sum_sn = sum(x -> (x / s) ^ 2, w)
1 / (s * (1 - sum_sn))
sum_w2 = sum(abs2, w)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wasn't this here to avoid overflow?

1 / (s - sum_w2 / s)
else
1 / s
end
Expand Down
25 changes: 22 additions & 3 deletions test/empirical.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
using StatsBase
using Test

function showstring(obj)
iobuf = IOBuffer()
show(iobuf, obj)
return String(take!(iobuf))
end

@testset "ECDF" begin
x = randn(10000000)
fnecdf = ecdf(x)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd change all of these to ecdf.(x), and copy this to deprecated.jl.

Expand All @@ -10,10 +16,15 @@ using Test
@test fnecdf(y) ≈ map(fnecdf, y)
@test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x)
fnecdf = ecdf([0.5])
@test showstring(fnecdf) == "ECDF{Float64,Float64,false}(1 values)\n"
@test fnecdf(0.5) == 1.0
@test fnecdf([zeros(5000); ones(5000)]) == [zeros(5000); ones(5000)]
@test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 0.5)
@test isnan(ecdf([1,2,3])(NaN))
@test_throws ArgumentError ecdf([1, NaN])
fnecdf = ecdf(Int[])
@test showstring(fnecdf) == "ECDF{$Int,Float64,false}(0 values)\n"
@test fnecdf.([0, 1, 2, 3]) == [0.0, 0.0, 0.0, 0.0]
end

@testset "Weighted ECDF" begin
Expand All @@ -23,17 +34,25 @@ end
fnecdf = ecdf(x, weights=w1)
fnecdfalt = ecdf(x, weights=w2)
@test fnecdf.sorted_values == fnecdfalt.sorted_values
@test fnecdf.weights == fnecdfalt.weights
@test fnecdf.weights != w1 # check that w wasn't accidently modified in place
@test fnecdfalt.weights != w2
@test_skip fnecdf.weights == fnecdfalt.weights
@test_skip fnecdf.weights != w1 # check that w wasn't accidently modified in place
@test_skip fnecdfalt.weights != w2
Comment on lines +37 to +39
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

y = [-1.96, -1.644854, -1.281552, -0.6744898, 0, 0.6744898, 1.281552, 1.644854, 1.96]
@test isapprox(fnecdf(y), [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975], atol=1e-3)
@test isapprox(fnecdf(1.96), 0.975, atol=1e-3)
@test fnecdf(y) ≈ map(fnecdf, y)
@test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x)
fnecdf = ecdf([1.0, 0.5], weights=weights([3, 1]))
@test fnecdf.([0.0, 0.5, 0.75, 1.0, 2.0]) == [0.0, 0.25, 0.25, 1.0, 1.0]
@test fnecdf(0.75) == 0.25
@test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 1.0)

fnecdfi = ecdf([1.0, 0.5], weights=weights([3, 1]), interpolate=true)
@test showstring(fnecdfi) == "ECDF{Float64,Float64,true}(2 values)\n"
@test fnecdfi.([0.0, 0.5, 0.75, 1.0, 2.0]) == [0.0, 0.25, 0.625, 1.0, 1.0]
@test fnecdfi(0.75) == 0.625
@test extrema(fnecdfi) == (minimum(fnecdfi), maximum(fnecdfi)) == (0.5, 1.0)

@test_throws ArgumentError ecdf(rand(8), weights=weights(rand(10)))
# Check frequency weights
v = randn(100)
Expand Down