JuliaStats · alyst · Aug 8, 2020 · Aug 9, 2020 · Aug 9, 2020 · Aug 9, 2020
diff --git a/src/common.jl b/src/common.jl
@@ -24,11 +24,11 @@ const RealFP = Union{Float32, Float64}
 const DepBool = Union{Bool, Nothing}
 
 function depcheck(fname::Symbol, b::DepBool)
-    if b == nothing
+    if b === nothing
         msg = "$fname will default to corrected=true in the future. Use corrected=false for previous behaviour."
         Base.depwarn(msg, fname)
-        false
+        return false
     else
-        b
+        return b
     end
 end
diff --git a/src/deprecates.jl b/src/deprecates.jl
@@ -39,3 +39,9 @@ end
 ### Deprecated September 2019
 @deprecate sum(A::AbstractArray, w::AbstractWeights, dims::Int) sum(A, w, dims=dims)
 @deprecate values(wv::AbstractWeights) convert(Vector, wv)
+
+### Deprecate August 2020 (v0.33)
+function (ecdf::ECDF)(v::RealArray)
+    depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", "(ECDF::ecdf)(v::RealArray)")
-    depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", "(ECDF::ecdf)(v::RealArray)")
+    depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", :ecdf)
-    depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", "(ECDF::ecdf)(v::RealArray)")
+    depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", :ecdf)
+    return ecdf.(v)
+end
diff --git a/src/empirical.jl b/src/empirical.jl
@@ -1,68 +1,142 @@
 # Empirical estimation of CDF and PDF
 
-## Empirical CDF
+"""
+Empirical Cumulative Distribution Function (ECDF).
-Empirical Cumulative Distribution Function (ECDF).
+    ECDF{T <: Real, W <: Real, I}
+
+Empirical Cumulative Distribution Function (ECDF).
-Empirical Cumulative Distribution Function (ECDF).
+    ECDF{T <: Real, W <: Real, I}
+
+Empirical Cumulative Distribution Function (ECDF).
+
+Represents ECDF constructed from random variable samples by [`ecdf`](@ref).
+
+# Type parameters
+* `T`: the type of random variable values
+* `W`: the type of sample weights and CDF values
+* `I`: boolean indicating whether to interpolate
+  the CDF between neighboring samples (continuous distribution)
+  or not (discrete distribution)
+"""
+struct ECDF{T <: Real, W <: Real, I}
+    # sorted random variable values and associated probabilities.
+    # The tuple provides:
+    #  - `x[i]` (value of random variable)
+    #  - `ECDF(x[i])`
+    #  - `1/(x[i+1] - x[i])`
+    #  - `ECDF(x[i+1]) - ECDF(x[i])` (the weight of `x[i+1]`)
+    sorted_values::Vector{Tuple{T, W, W, W}}
+end
+
+# type of value weights and CDF values
+weighttype(::Type{<:ECDF{<:Any, W}}) where W = W
+weighttype(ecdf::ECDF) = weighttype(typeof(ecdf))
 
-struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}}
-    sorted_values::T
-    weights::W
+# whether to interpolate between the neighboring ECDF values
+isinterpolating(::Type{<:ECDF{<:Any, <:Any, I}}) where I = I
+isinterpolating(ecdf::ECDF) = isinterpolating(typeof(ecdf))
+
+function Base.show(io::IO, e::ECDF)
+    print(io, typeof(e))
+    print(io, "(")
+    print(io, length(e.sorted_values))
+    println(io, " values)")
-    print(io, "(")
-    print(io, length(e.sorted_values))
-    println(io, " values)")
+    print(io, "(")
+    print(io, length(e.sorted_values))
+    println(io, " values)")
-    print(io, "(")
-    print(io, length(e.sorted_values))
-    println(io, " values)")
+    print(io, " with ")
+    print(io, length(e.sorted_values))
+    println(io, " values")
-    print(io, "(")
-    print(io, length(e.sorted_values))
-    println(io, " values)")
+    print(io, "(")
+    print(io, length(e.sorted_values))
+    println(io, " values)")
-    print(io, "(")
-    print(io, length(e.sorted_values))
-    println(io, " values)")
+    print(io, " with ")
+    print(io, length(e.sorted_values))
+    println(io, " values")
 end
 
+# calculate ECDF at given point
 function (ecdf::ECDF)(x::Real)
     isnan(x) && return NaN
-    n = searchsortedlast(ecdf.sorted_values, x)
-    evenweights = isempty(ecdf.weights)
-    weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights)
-    partialsum = evenweights ? n : sum(view(ecdf.weights, 1:n))
-    partialsum / weightsum
+    pos = searchsortedlast(ecdf.sorted_values, x, by=first)
+    (pos == 0) && return zero(weighttype(ecdf))
+    @inbounds val, cdf_val, val_invdelta, cdf_delta = ecdf.sorted_values[pos]
+    if !isinterpolating(ecdf) || (val == x) || (pos == length(ecdf.sorted_values))
+        return cdf_val
+    else
+        return muladd(cdf_delta, min((x - val) * val_invdelta, one(weighttype(ecdf))), cdf_val)
+    end
 end
 
-function (ecdf::ECDF)(v::RealVector)
-    evenweights = isempty(ecdf.weights)
-    weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights)
-    ord = sortperm(v)
-    m = length(v)
-    r = similar(ecdf.sorted_values, m)
-    r0 = zero(weightsum)
-    i = 1
-    for (j, x) in enumerate(ecdf.sorted_values)
-        while i <= m && x > v[ord[i]]
-            r[ord[i]] = r0
-            i += 1
-        end
-        r0 += evenweights ? 1 : ecdf.weights[j]
-        if i > m
-            break
-        end
-    end
-    while i <= m
-        r[ord[i]] = weightsum
-        i += 1
+# broadcasts ecdf() over an array
+# caches the last calculated value
+function Base.Broadcast.broadcasted(ecdf::ECDF, v::AbstractArray)
+    res = similar(v, weighttype(ecdf))
+    @inbounds for i in eachindex(v)
+        res[i] = i == 1 || v[i] != v[i-1] ? ecdf(v[i]) : res[i-1]
     end
-    return r / weightsum
+    return res
 end
 
 """
-    ecdf(X; weights::AbstractWeights)
+    ecdf(X::RealVector; weights::AbstractWeights=nothing, interpolate::Bool=false) -> ECDF
+
+Construct an *empirical cumulative distribution function* (ECDF) based on a vector of samples (`X`).
-Construct an *empirical cumulative distribution function* (ECDF) based on a vector of samples (`X`).
+Construct an *empirical cumulative distribution function* (ECDF) based on vector of samples `X`.
-Construct an *empirical cumulative distribution function* (ECDF) based on a vector of samples (`X`).
+Construct an *empirical cumulative distribution function* (ECDF) based on vector of samples `X`.
+Returns a callable [`ECDF`](@ref) object.
 
-Return an empirical cumulative distribution function (ECDF) based on a vector of samples
-given in `X`. Optionally providing `weights` returns a weighted ECDF.
+# Example
 
-Note: this function that returns a callable composite type, which can then be applied to
-evaluate CDF values on other samples.
+```jldoctest
+julia> using StatsBase
 
-`extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which
-function is inside the interval ``(0,1)``; the function is defined for the whole real line.
+julia> X = 1:15
+1:15
+
+julia> xcdf = ecdf(X)
+ECDF{Int64,Float64,false}(15 values)
+
+julia> xcdf(9)
+0.6
+```
+
+## Keyword arguments
+* `weights`: optional weights for each `X` sample
+* `interpolate`: when enabled, `ECDF(y)` will linearly interpolate the cumulative density between
+  the neighboring `X` samples (``x_i \\leq y < x_{i+1}``),
-  the neighboring `X` samples (``x_i \\leq y < x_{i+1}``),
+  the neighboring values in `X` (``x_i \\leq y < x_{i+1}`` with ``x`` sorted values in `X`),
-  the neighboring `X` samples (``x_i \\leq y < x_{i+1}``),
+  the neighboring values in `X` (``x_i \\leq y < x_{i+1}`` with ``x`` sorted values in `X`),
+  otherwise (by default) `ECDF(x[i])` is returned
+
+## Notes
+The returned function is defined for the whole real line.
+Use [`extrema`](@ref), [`minimum`](@ref), and [`maximum`](@ref) to obtain the range
+over which the ECDF is inside the interval ``(0, 1)``.
 """
-function ecdf(X::RealVector; weights::AbstractVector{<:Real}=Weights(Float64[]))
+function ecdf(X::RealVector;
+              weights::Union{Nothing, RealVector}=nothing,
+              interpolate::Bool=false)
     any(isnan, X) && throw(ArgumentError("ecdf can not include NaN values"))
-    isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," *
-        "got $(length(X)) and $(length(weights))"))
+    evenweights = isnothing(weights) || isempty(weights)
-    evenweights = isnothing(weights) || isempty(weights)
+    evenweights = weights === nothing || isempty(weights)
-    evenweights = isnothing(weights) || isempty(weights)
+    evenweights = weights === nothing || isempty(weights)
+    evenweights || (length(X) == length(weights)) ||
+        throw(ArgumentError("data and weight vectors must be the same size, " *
+                            "got $(length(X)) and $(length(weights))"))
+    T = eltype(X)
+    W0 = evenweights ? Int : eltype(weights)
+    W = isnothing(weights) ? Float64 : eltype(one(W0)/sum(weights))
+
+    wsum = evenweights ? length(X) : sum(weights)
     ord = sortperm(X)
-    ECDF(X[ord], isempty(weights) ? weights : Weights(weights[ord]))
+
+    sorted_vals = sizehint!(Vector{Tuple{T, W, W, W}}(), length(X))
+    isempty(X) && return ECDF{T, W, interpolate}(sorted_vals)
+
+    valprev = val = X[first(ord)]
+    wsumprev = zero(W0)
+    valw = zero(W0)
+
+    push_valprev!() = push!(sorted_vals, (valprev, min(wsumprev/wsum, one(W)),
+                                          inv(val - valprev), valw/wsum))
+
+    @inbounds for i in ord
+        valnew = X[i]
+        if (val != valnew) || (i == last(ord))
+            (wsumprev > 0) && push_valprev!()
+            valprev = val
+            val = valnew
+            wsumprev += valw
+            valw = zero(W0)
+        end
+        valw += evenweights ? one(W0) : weights[i]
+    end
+    #@assert valw + wsumprev == wsum # may fail due to fp-arithmetic
+    (wsumprev > 0) && push_valprev!()
+    # last value
+    push!(sorted_vals, (val, one(W), zero(W), zero(W)))
+    return ECDF{T,W,interpolate}(sorted_vals)
 end
 
-minimum(ecdf::ECDF) = first(ecdf.sorted_values)
+minimum(ecdf::ECDF) = first(ecdf.sorted_values)[1]
 
-maximum(ecdf::ECDF) = last(ecdf.sorted_values)
+maximum(ecdf::ECDF) = last(ecdf.sorted_values)[1]
 
 extrema(ecdf::ECDF) = (minimum(ecdf), maximum(ecdf))
diff --git a/src/weights.jl b/src/weights.jl
@@ -121,8 +121,8 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs))
     s = w.sum
 
     if corrected
-        sum_sn = sum(x -> (x / s) ^ 2, w)
-        1 / (s * (1 - sum_sn))
+        sum_w2 = sum(abs2, w)
+        1 / (s - sum_w2 / s)
     else
         1 / s
     end

diff --git a/test/empirical.jl b/test/empirical.jl
@@ -1,6 +1,12 @@
 using StatsBase
 using Test
 
+function showstring(obj)
+    iobuf = IOBuffer()
+    show(iobuf, obj)
+    return String(take!(iobuf))
+end
+
 @testset "ECDF" begin
     x = randn(10000000)
     fnecdf = ecdf(x)
@@ -10,10 +16,15 @@ using Test
     @test fnecdf(y) ≈ map(fnecdf, y)
     @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x)
     fnecdf = ecdf([0.5])
+    @test showstring(fnecdf) == "ECDF{Float64,Float64,false}(1 values)\n"
+    @test fnecdf(0.5) == 1.0
     @test fnecdf([zeros(5000); ones(5000)]) == [zeros(5000); ones(5000)]
     @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 0.5)
     @test isnan(ecdf([1,2,3])(NaN))
     @test_throws ArgumentError ecdf([1, NaN])
+    fnecdf = ecdf(Int[])
+    @test showstring(fnecdf) == "ECDF{$Int,Float64,false}(0 values)\n"
+    @test fnecdf.([0, 1, 2, 3]) == [0.0, 0.0, 0.0, 0.0]
 end
 
 @testset "Weighted ECDF" begin
@@ -23,17 +34,25 @@ end
     fnecdf = ecdf(x, weights=w1)
     fnecdfalt = ecdf(x, weights=w2)
     @test fnecdf.sorted_values == fnecdfalt.sorted_values
-    @test fnecdf.weights == fnecdfalt.weights
-    @test fnecdf.weights != w1  #  check that w wasn't accidently modified in place
-    @test fnecdfalt.weights != w2
+    @test_skip fnecdf.weights == fnecdfalt.weights
+    @test_skip fnecdf.weights != w1  #  check that w wasn't accidently modified in place
+    @test_skip fnecdfalt.weights != w2
     y = [-1.96, -1.644854, -1.281552, -0.6744898, 0, 0.6744898, 1.281552, 1.644854, 1.96]
     @test isapprox(fnecdf(y), [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975], atol=1e-3)
     @test isapprox(fnecdf(1.96), 0.975, atol=1e-3)
     @test fnecdf(y) ≈ map(fnecdf, y)
     @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x)
     fnecdf = ecdf([1.0, 0.5], weights=weights([3, 1]))
+    @test fnecdf.([0.0, 0.5, 0.75, 1.0, 2.0]) == [0.0, 0.25, 0.25, 1.0, 1.0]
     @test fnecdf(0.75) == 0.25
     @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 1.0)
+
+    fnecdfi = ecdf([1.0, 0.5], weights=weights([3, 1]), interpolate=true)
+    @test showstring(fnecdfi) == "ECDF{Float64,Float64,true}(2 values)\n"
+    @test fnecdfi.([0.0, 0.5, 0.75, 1.0, 2.0]) == [0.0, 0.25, 0.625, 1.0, 1.0]
+    @test fnecdfi(0.75) == 0.625
+    @test extrema(fnecdfi) == (minimum(fnecdfi), maximum(fnecdfi)) == (0.5, 1.0)
+
     @test_throws ArgumentError ecdf(rand(8), weights=weights(rand(10)))
     #  Check frequency weights
     v = randn(100)