diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 91fbcded..a707beda 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -29,3 +29,22 @@ SUITE["cutree"] = BenchmarkGroup() for (n, k) in ((10, 3), (100, 10), (1000, 100), (10000, 1000)) SUITE["cutree"][(n,k)] = @benchmarkable cutree(hclu, k=$k) setup=(D=random_distance_matrix($n, 5); hclu=hclust(D, linkage=:single)) end + +function silhouette_benchmark(metric, assgns, points, nclusters) + res = BenchmarkGroup() + res[:distances] = @benchmarkable silhouettes($assgns, pairwise($metric, $points, $points, dims=2)) + res[:points] = @benchmarkable silhouettes($assgns, $points; metric=$metric) + return res +end + +SUITE["silhouette"] = BenchmarkGroup() +for metric in [SqEuclidean(), Euclidean()] + SUITE["silhouette"]["metric=$(typeof(metric))"] = metric_bench = BenchmarkGroup() + for n in [100, 1000, 10000, 20000] + nclusters = 10 + dims = 10 + points = randn(dims, n) + assgns = rand(1:nclusters, n) + metric_bench["n=$n"] = silhouette_benchmark(metric, assgns, points, nclusters) + end +end \ No newline at end of file diff --git a/src/Clustering.jl b/src/Clustering.jl index bb7f44cd..808b37b2 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -83,6 +83,7 @@ module Clustering include("fuzzycmeans.jl") include("counts.jl") + include("cluster_distances.jl") include("silhouette.jl") include("randindex.jl") include("varinfo.jl") diff --git a/src/cluster_distances.jl b/src/cluster_distances.jl new file mode 100644 index 00000000..326aae35 --- /dev/null +++ b/src/cluster_distances.jl @@ -0,0 +1,226 @@ +""" +Base type for efficient computation of average(mean) distances +from the cluster centers to a given point. + +The descendant types should implement the following methods: + * `update!(dists, assignments, points)`: update the internal + state of `dists` with point coordinates and their assignments to the clusters + * `sumdistances(dists, points, indices)`: compute the sum of + distances from all `dists` clusters to `points` +""" +abstract type ClusterDistances{T} end + +# create empty ClusterDistances object for a given metric +# and update it with a given clustering +# if batch_size is specified, the updates are done in point batches of given size +function ClusterDistances(metric::SemiMetric, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix{<:Real}, + batch_size::Union{Integer, Nothing} = nothing) + update!(ClusterDistances(eltype(points), metric, length(assignments), size(points, 1), + maximum(assignments)), + assignments, points, batch_size) +end + +ClusterDistances(metric, R::ClusteringResult, args...) = + ClusterDistances(metric, assignments(R), args...) + +# fallback implementations of ClusteringDistances methods + +cluster_sizes(dists::ClusterDistances) = dists.cluster_sizes +nclusters(dists::ClusterDistances) = length(cluster_sizes(dists)) + +update!(dists::ClusterDistances, + assignments::AbstractVector, points::AbstractMatrix) = + error("update!(dists::$(typeof(dists))) not implemented") + +sumdistances(dists::ClusterDistances, + points::Union{AbstractMatrix, Nothing}, + indices::Union{AbstractVector{<:Integer}, Nothing}) = + error("sumdistances(dists::$(typeof(dists))) not implemented") + +# average distances from each cluster to each point, nclusters×n matrix +function meandistances(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, + points::Union{AbstractMatrix, Nothing}, + indices::AbstractVector{<:Integer}) + @assert length(assignments) == length(indices) + (points === nothing) || @assert(size(points, 2) == length(indices)) + clu_to_pt = sumdistances(dists, points, indices) + clu_sizes = cluster_sizes(dists) + @assert length(assignments) == length(indices) + @assert size(clu_to_pt) == (length(clu_sizes), length(assignments)) + + # normalize distances by cluster sizes + @inbounds for j in eachindex(assignments) + for (i, c) in enumerate(clu_sizes) + if i == assignments[j] + c -= 1 + end + if c == 0 + clu_to_pt[i,j] = 0 + else + clu_to_pt[i,j] /= c + end + end + end + return clu_to_pt +end + +# wrapper for ClusteringResult +update!(dists::ClusterDistances, R::ClusteringResult, args...) = + update!(dists, assignments(R), args...) + +# batch-update silhouette dists (splitting the points into chunks of batch_size size) +function update!(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, points::AbstractMatrix{<:Real}, + batch_size::Union{Integer, Nothing}) + n = size(points, 2) + ((batch_size === nothing) || (n <= batch_size)) && + return update!(dists, assignments, points) + + for batch_start in 1:batch_size:n + batch_ixs = batch_start:min(batch_start + batch_size - 1, n) + update!(dists, view(assignments, batch_ixs), view(points, :, batch_ixs)) + end + return dists +end + +# generic ClusterDistances implementation for an arbitrary metric M +# if M is Nothing, point_dists is an arbitrary matrix of point distances +struct SimpleClusterDistances{M, T} <: ClusterDistances{T} + metric::M + cluster_sizes::Vector{Int} + assignments::Vector{Int} + point_dists::Matrix{T} + + SimpleClusterDistances(::Type{T}, metric::M, + npoints::Integer, nclusters::Integer) where {M<:Union{SemiMetric, Nothing}, T<:Real} = + new{M, T}(metric, zeros(Int, nclusters), Vector{Int}(), + Matrix{T}(undef, npoints, npoints)) + + # reuse given points matrix + function SimpleClusterDistances( + metric::Nothing, + assignments::AbstractVector{<:Integer}, + point_dists::AbstractMatrix{T} + ) where T<:Real + n = length(assignments) + size(point_dists) == (n, n) || throw(DimensionMismatch("assignments length ($n) does not match distances matrix size ($(size(point_dists)))")) + issymmetric(point_dists) || throw(ArgumentError("point distances matrix must be symmetric")) + clu_sizes = zeros(Int, maximum(assignments)) + @inbounds for cluster in assignments + clu_sizes[cluster] += 1 + end + new{Nothing, T}(metric, clu_sizes, assignments, point_dists) + end +end + +# fallback ClusterDistances constructor +ClusterDistances(::Type{T}, metric::Union{SemiMetric, Nothing}, + npoints::Union{Integer, Nothing}, dims::Integer, nclusters::Integer) where T<:Real = + SimpleClusterDistances(T, metric, npoints, nclusters) + +# when metric is nothing, points is the matrix of distances +function ClusterDistances(metric::Nothing, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix, + batch_size::Union{Integer, Nothing} = nothing) + (batch_size === nothing) || (size(points, 2) > batch_size) || + error("batch-updates of distance matrix-based ClusterDistances not supported") + SimpleClusterDistances(metric, assignments, points) +end + +function update!(dists::SimpleClusterDistances{M}, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix{<:Real}) where M + @assert length(assignments) == size(points, 2) + check_assignments(assignments, nclusters(dists)) + append!(dists.assignments, assignments) + n = size(dists.point_dists, 1) + length(dists.assignments) == n || + error("$(typeof(dists)) does not support batch updates: $(length(assignments)) points given, $n expected") + @inbounds for cluster in assignments + dists.cluster_sizes[cluster] += 1 + end + + if M === Nothing + size(point_dists) == (n, n) || + throw(DimensionMismatch("points should be a point-to-point distances matrix of ($n, $n) size, $(size(points)) given")) + copy!(dists.point_dists, point_dists) + else + # metric-based SimpleClusterDistances does not support batched updates + size(points, 2) == n || + throw(DimensionMismatch("points should be a point coordinates matrix with $n columns, $(size(points, 2)) found")) + pairwise!(dists.metric, dists.point_dists, points, dims=2) + end + + return dists +end + +# this function returns matrix r nclusters x n, such that +# r[i, j] is the sum of distances from all i-th cluster points to the point indices[j] +function sumdistances(dists::SimpleClusterDistances, + points::Union{AbstractMatrix, Nothing}, # unused as distances are already in point_dists + indices::AbstractVector{<:Integer}) + T = eltype(dists.point_dists) + n = length(dists.assignments) + S = typeof((one(T)+one(T))/2) + r = zeros(S, nclusters(dists), n) + @inbounds for (jj, j) in enumerate(indices) + for i = 1:j-1 + r[dists.assignments[i], jj] += dists.point_dists[i,j] + end + for i = j+1:n + r[dists.assignments[i], jj] += dists.point_dists[i,j] + end + end + return r +end + +# uses the method from "Distributed Silhouette Algorithm: Evaluating Clustering on Big Data" +# https://arxiv.org/abs/2303.14102 +# for SqEuclidean point distances +struct SqEuclideanClusterDistances{T} <: ClusterDistances{T} + cluster_sizes::Vector{Int} # [nclusters] + Y::Matrix{T} # [dims, nclusters], the first moments of each cluster (sum of point coords) + Ψ::Vector{T} # [nclusters], the second moments of each cluster (sum of point coord squares) + + SqEuclideanClusterDistances(::Type{T}, npoints::Union{Integer, Nothing}, dims::Integer, + nclusters::Integer) where T<:Real = + new{T}(zeros(Int, nclusters), zeros(T, dims, nclusters), zeros(T, nclusters)) +end + +ClusterDistances(::Type{T}, metric::SqEuclidean, npoints::Union{Integer, Nothing}, + dims::Integer, nclusters::Integer) where T<:Real = + SqEuclideanClusterDistances(T, npoints, dims, nclusters) + +function update!(dists::SqEuclideanClusterDistances, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix{<:Real}) + # x dims are [D,N] + d, n = size(points) + k = length(cluster_sizes(dists)) + check_assignments(assignments, k) + n == length(assignments) || throw(DimensionMismatch("points count ($n) does not match assignments length $(length(assignments)))")) + d == size(dists.Y, 1) || throw(DimensionMismatch("points dims ($(size(points, 1))) do no must match ClusterDistances dims ($(size(dists.Y, 1)))")) + # precompute moments and update counts + @inbounds for (i, cluster) in enumerate(assignments) + point = view(points, :, i) # switch to eachslice() once Julia-1.0 support is dropped + dists.cluster_sizes[cluster] += 1 + dists.Y[:, cluster] .+= point + dists.Ψ[cluster] += sum(abs2, point) + end + return dists +end + +# sum distances from each cluster to each point in `points`, [nclusters, n] +function sumdistances(dists::SqEuclideanClusterDistances, + points::AbstractMatrix, + indices::AbstractVector{<:Integer}) + @assert size(points, 2) == length(indices) + point_norms = sum(abs2, points; dims=1) # [1,n] + return dists.cluster_sizes .* point_norms .+ + reshape(dists.Ψ, nclusters(dists), 1) .- + 2 * (transpose(dists.Y) * points) +end diff --git a/src/silhouette.jl b/src/silhouette.jl index fc9622a2..61600e62 100644 --- a/src/silhouette.jl +++ b/src/silhouette.jl @@ -1,90 +1,26 @@ # Silhouette -# this function returns r of size (k, n), such that -# r[i, j] is the sum of distances of all points from cluster i to point j -# -function sil_aggregate_dists(k::Int, a::AbstractVector{Int}, dists::AbstractMatrix{T}) where T<:Real - n = length(a) - S = typeof((one(T)+one(T))/2) - r = zeros(S, k, n) - @inbounds for j = 1:n - for i = 1:j-1 - r[a[i],j] += dists[i,j] - end - for i = j+1:n - r[a[i],j] += dists[i,j] - end - end - return r -end - - -""" - silhouettes(assignments::AbstractVector, [counts,] dists) -> Vector{Float64} - silhouettes(clustering::ClusteringResult, dists) -> Vector{Float64} - -Compute *silhouette* values for individual points w.r.t. given clustering. - -Returns the ``n``-length vector of silhouette values for each individual point. - -# Arguments - - `assignments::AbstractVector{Int}`: the vector of point assignments - (cluster indices) - - `counts::AbstractVector{Int}`: the optional vector of cluster sizes (how many - points assigned to each cluster; should match `assignments`) - - `clustering::ClusteringResult`: the output of some clustering method - - `dists::AbstractMatrix`: ``n×n`` matrix of pairwise distances between - the points - -# References -> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the -> Interpretation and Validation of Cluster Analysis*. Computational and -> Applied Mathematics. 20: 53–65. -""" -function silhouettes(assignments::AbstractVector{<:Integer}, - counts::AbstractVector{<:Integer}, - dists::AbstractMatrix{T}) where T<:Real - +# compute silhouette scores for each point given a matrix of cluster-to-point distances +function silhouettes_scores(clu_to_pt::AbstractMatrix{<:Real}, + assignments::AbstractVector{<:Integer}, + clu_sizes::AbstractVector{<:Integer}) n = length(assignments) - k = length(counts) - k >= 2 || throw(ArgumentError("silhouettes() not defined for the degenerated clustering with a single cluster.")) - for j = 1:n - (1 <= assignments[j] <= k) || throw(ArgumentError("Bad assignments[$j]=$(assignments[j]): should be in 1:$k range.")) - end - sum(counts) == n || throw(ArgumentError("Mismatch between assignments ($n) and counts (sum(counts)=$(sum(counts))).")) - size(dists) == (n, n) || throw(DimensionMismatch("The size of a distance matrix ($(size(dists))) doesn't match the length of assignment vector ($n).")) - - # compute average distance from each cluster to each point --> r - r = sil_aggregate_dists(k, assignments, dists) - # from sum to average - @inbounds for j = 1:n - for i = 1:k - c = counts[i] - if i == assignments[j] - c -= 1 - end - if c == 0 - r[i,j] = 0 - else - r[i,j] /= c - end - end - end + @assert size(clu_to_pt) == (length(clu_sizes), n) # compute a and b # a: average distance w.r.t. the assigned cluster # b: the minimum average distance w.r.t. other cluster - a = similar(r, n) - b = similar(r, n) - - for j = 1:n + a = similar(clu_to_pt, n) + b = similar(clu_to_pt, n) + nclusters = length(clu_sizes) + for j in 1:n l = assignments[j] - a[j] = r[l, j] + a[j] = clu_to_pt[l, j] v = typemax(eltype(b)) - @inbounds for i = 1:k - counts[i] == 0 && continue # skip empty clusters - rij = r[i,j] + @inbounds for i = 1:nclusters + clu_sizes[i] == 0 && continue # skip empty clusters + rij = clu_to_pt[i, j] if (i != l) && (rij < v) v = rij end @@ -95,25 +31,80 @@ function silhouettes(assignments::AbstractVector{<:Integer}, # compute silhouette score sil = a # reuse the memory of a for sil for j = 1:n - if counts[assignments[j]] == 1 + if clu_sizes[assignments[j]] == 1 sil[j] = 0 else #If both a[i] and b[i] are equal to 0 or Inf, silhouettes is defined as 0 @inbounds sil[j] = a[j] < b[j] ? 1 - a[j]/b[j] : a[j] > b[j] ? b[j]/a[j] - 1 : - zero(eltype(r)) + zero(eltype(sil)) end end return sil end -silhouettes(R::ClusteringResult, dists::AbstractMatrix) = - silhouettes(assignments(R), counts(R), dists) +# calculate silhouette scores (single batch) +silhouettes_batch(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, + points::Union{AbstractMatrix, Nothing}, + indices::AbstractVector{<:Integer}) = + silhouettes_scores(meandistances(dists, assignments, points, indices), + assignments, cluster_sizes(dists)) + +# batch-calculate silhouette scores (splitting the points into chunks of batch_size size) +function silhouettes(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, + points::Union{AbstractMatrix, Nothing}, + batch_size::Union{Integer, Nothing} = nothing) + n = length(assignments) + ((batch_size === nothing) || (n <= batch_size)) && + return silhouettes_batch(dists, assignments, points, eachindex(assignments)) -function silhouettes(assignments::AbstractVector{<:Integer}, dists::AbstractMatrix) - counts = fill(0, maximum(assignments)) - for a in assignments - counts[a] += 1 + return mapreduce(vcat, 1:batch_size:n) do batch_start + batch_ixs = batch_start:min(batch_start + batch_size - 1, n) + # copy points/assignments to speed up matrix and indexing operations + silhouettes_batch(dists, assignments[batch_ixs], + points !== nothing ? points[:, batch_ixs] : nothing, + batch_ixs) end - silhouettes(assignments, counts, dists) end + +""" + silhouettes(assignments::Union{AbstractVector, ClusteringResult}, point_dists::Matrix) -> Vector{Float64} + silhouettes(assignments::Union{AbstractVector, ClusteringResult}, points::Matrix; + metric::SemiMetric, [batch_size::Integer]) -> Vector{Float64} + +Compute *silhouette* values for individual points w.r.t. given clustering. + +Returns the ``n``-length vector of silhouette values for each individual point. + +# Arguments + - `assignments::Union{AbstractVector{Int}, ClusteringResult}`: the vector of point assignments + (cluster indices) + - `points::AbstractMatrix`: if metric is nothing it is an ``n×n`` matrix of pairwise distances between the points, + otherwise it is an ``d×n`` matrix of `d` dimensional clustered data points. + - `metric::Union{SemiMetric, Nothing}`: an instance of Distances Metric object or nothing, + indicating the distance metric used for calculating point distances. + - `batch_size::Union{Integer, Nothing}`: if integer is given, calculate silhouettes in batches + of `batch_size` points each, throws `DimensionMismatch` if batched calculation + is not supported by given `metric`. + +# References +> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the +> Interpretation and Validation of Cluster Analysis*. Computational and +> Applied Mathematics. 20: 53–65. +> Marco Gaido (2023). Distributed Silhouette Algorithm: Evaluating Clustering on Big Data +""" +function silhouettes(assignments::AbstractVector{<:Integer}, + points::AbstractMatrix; + metric::Union{SemiMetric, Nothing} = nothing, + batch_size::Union{Integer, Nothing} = nothing) + nclusters = maximum(assignments) + nclusters >= 2 || throw(ArgumentError("silhouettes() not defined for the degenerated clustering with a single cluster.")) + check_assignments(assignments, nclusters) + return silhouettes(ClusterDistances(metric, assignments, points, batch_size), + assignments, points, batch_size) +end + +silhouettes(R::ClusteringResult, points::AbstractMatrix; kwargs...) = + silhouettes(assignments(R), points; kwargs...) diff --git a/src/utils.jl b/src/utils.jl index a01db9dc..f65e4914 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -76,3 +76,10 @@ function updatemin!(r::AbstractArray, x::AbstractArray) end return r end + +function check_assignments(assignments::AbstractVector{<:Integer}, nclusters::Union{Integer, Nothing}) + nclu = nclusters === nothing ? maximum(assignments) : nclusters + for (j, c) in enumerate(assignments) + all(1 <= c <= nclu) || throw(ArgumentError("Bad assignments[$j]=$c: should be in 1:$nclu range.")) + end +end \ No newline at end of file diff --git a/test/silhouette.jl b/test/silhouette.jl index f3724c40..643cf779 100644 --- a/test/silhouette.jl +++ b/test/silhouette.jl @@ -11,29 +11,20 @@ local D = [0 1 2 3 @assert size(D) == (4, 4) -local a = [1, 1, 2, 2] -local c = [2, 2] - @testset "Input checks" begin - @test_skip silhouettes(a, [1, 1, 2], D) # should throw because cluster counts are inconsistent - @test_throws ArgumentError silhouettes(a, [3, 2], D) - @test_throws ArgumentError silhouettes([1, 1, 3, 2], [2, 2], D) - @test_throws DimensionMismatch silhouettes([1, 1, 2, 2, 2], [2, 3], D) + @test_throws DimensionMismatch silhouettes([1, 1, 3, 2], D[1:4, 1:3]) + @test_throws DimensionMismatch silhouettes([1, 1, 2, 2, 2], D) + @test_throws Exception silhouettes([1, 1, 2, 2, 2], D, batch_size=3) + D2 = copy(D) + D2[2, 3] = 4 + @test_throws ArgumentError silhouettes([1, 1, 2, 2], D2) end -@test @inferred(silhouettes(a, c, D)) ≈ [1.5/2.5, 0.5/1.5, 0.5/1.5, 1.5/2.5] -@test @inferred(silhouettes(a, c, convert(Matrix{Float32}, D))) isa AbstractVector{Float32} -@test silhouettes(a, D) == silhouettes(a, c, D) # c is optional - -a = [1, 2, 1, 2] -c = [2, 2] - -@test silhouettes(a, c, D) ≈ [0.0, -0.5, -0.5, 0.0] - -a = [1, 1, 1, 2] -c = [3, 1] +@test @inferred(silhouettes([1, 1, 2, 2], D)) ≈ [1.5/2.5, 0.5/1.5, 0.5/1.5, 1.5/2.5] +@test @inferred(silhouettes([1, 1, 2, 2], convert(Matrix{Float32}, D))) isa AbstractVector{Float32} -@test silhouettes(a, c, D) ≈ [0.5, 0.5, -1/3, 0.0] +@test silhouettes([1, 2, 1, 2], D) ≈ [0.0, -0.5, -0.5, 0.0] +@test silhouettes([1, 1, 1, 2], D) ≈ [0.5, 0.5, -1/3, 0.0] @testset "zero cluster distances correctly" begin a = [fill(1, 5); fill(2, 5)] @@ -42,6 +33,7 @@ c = [3, 1] @test silhouettes(a, d) == fill(0.0, 10) d = fill(1, (10, 10)) + for i in 1:10; d[i, i] = 0; end d[1, 2] = d[2, 1] = 5 @test silhouettes(a, d) == [[-0.5, -0.5]; fill(0.0, 8)] @@ -56,9 +48,36 @@ end end @testset "empty clusters handled correctly (#241)" begin - X = rand(MersenneTwister(123), 10, 5) - pd = pairwise(Euclidean(), X, dims=1) - @test all(>=(-0.5), silhouettes([5, 2, 2, 3, 2, 2, 3, 2, 3, 5], pd)) + X = rand(MersenneTwister(123), 3, 10) + pd = pairwise(Euclidean(), X, dims=2) + asgns = [5, 2, 2, 3, 2, 2, 3, 2, 3, 5] + @test all(>=(-0.5), silhouettes(asgns, pd)) + @test all(>=(-0.5), silhouettes(asgns, X, metric=Euclidean())) +end + +@testset "silhouettes(metric=$metric, batch_size=$(batch_size !== nothing ? batch_size : "nothing"))" for + (metric, batch_size, supported) in [ + (Euclidean(), nothing, true), + (Euclidean(), 1000, true), + (Euclidean(), 10, false), + + (SqEuclidean(), nothing, true), + (SqEuclidean(), 1000, true), + (SqEuclidean(), 10, true), + ] + + Random.seed!(123) + X = rand(3, 100) + pd = pairwise(metric, X, dims=2) + a = rand(1:10, size(X, 2)) + kmeans_clu = kmeans(X, 5) + if supported + @test silhouettes(a, X; metric=metric, batch_size=batch_size) ≈ silhouettes(a, pd) + @test silhouettes(kmeans_clu, X; metric=metric, batch_size=batch_size) ≈ silhouettes(kmeans_clu, pd) + else + @test_throws Exception silhouettes(a, X; metric=metric, batch_size=batch_size) + @test_throws Exception silhouettes(kmeans_clu, X; metric=metric, batch_size=batch_size) + end end end