Skip to content

Commit 5d1ad38

Browse files
authored
Merge pull request #227 from wildart/cm
fixed ARI (fixes #225 & #226)
2 parents d59b682 + 945c537 commit 5d1ad38

File tree

7 files changed

+121
-28
lines changed

7 files changed

+121
-28
lines changed

doc/source/validate.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,13 @@ the similarity of two different clusterings of a dataset.
9999
```@docs
100100
mutualinfo
101101
```
102+
103+
## Confusion matrix
104+
105+
The pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
106+
arising from two clusterings is a 2×2 contingency table representation of
107+
the partition co-occurrence, see [`counts`](@ref).
108+
109+
```@docs
110+
confusion
111+
```

src/Clustering.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@ module Clustering
6565
Hclust, hclust, cutree,
6666

6767
# MCL
68-
mcl, MCLResult
68+
mcl, MCLResult,
69+
70+
# pair confusion matrix
71+
confusion
6972

7073
## source files
7174

@@ -85,6 +88,7 @@ module Clustering
8588
include("varinfo.jl")
8689
include("vmeasure.jl")
8790
include("mutualinfo.jl")
91+
include("confusion.jl")
8892

8993
include("hclust.jl")
9094

src/confusion.jl

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
    confusion(a::Union{ClusteringResult, AbstractVector{<:Integer}},
              b::Union{ClusteringResult, AbstractVector{<:Integer}}) -> Matrix{Int}

Compute the 2×2 pair confusion matrix `C` of the clusterings `a` and `b`.

Every unordered pair of samples is classified twice: by whether its two members
share a cluster in `a` (the reference, or "true", clustering) and by whether
they share a cluster in `b` (the predicted clustering). A pair whose members
are grouped together is a **positive pair**; a pair whose members are split
apart is a **negative pair**. Rows of `C` refer to `a` and columns to `b`, so
`C₁₁` counts true positives, `C₁₂` false negatives, `C₂₁` false positives and
`C₂₂` true negatives:

|          | Positive | Negative |
|:--------:|:--------:|:--------:|
| Positive |   C₁₁    |   C₁₂    |
| Negative |   C₂₁    |   C₂₂    |
"""
function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer})
    table = counts(a, b)    # contingency table of the two labelings

    total = sum(table)                          # number of samples
    rowsq = sum(abs2, sum(table, dims=2))       # Σ (row sums)²
    colsq = sum(abs2, sum(table, dims=1))       # Σ (column sums)²
    cellsq = sum(abs2, table)                   # Σ nᵢⱼ²

    # Each numerator counts ordered pairs, hence is even and halves exactly.
    same_same = (cellsq - total) ÷ 2                        # together in both
    same_diff = (rowsq - cellsq) ÷ 2                        # together in `a` only
    diff_same = (colsq - cellsq) ÷ 2                        # together in `b` only
    diff_diff = (cellsq + total^2 - (rowsq + colsq)) ÷ 2    # apart in both

    return [same_same same_diff; diff_same diff_diff]
end
32+
33+
# Convenience methods: a `ClusteringResult` may be given in either position;
# its assignment vector is extracted and forwarded to the label-vector method.
function confusion(a::ClusteringResult, b::ClusteringResult)
    return confusion(assignments(a), assignments(b))
end

function confusion(a::AbstractVector{<:Integer}, b::ClusteringResult)
    return confusion(a, assignments(b))
end

function confusion(a::ClusteringResult, b::AbstractVector{<:Integer})
    return confusion(assignments(a), b)
end
39+

src/randindex.jl

Lines changed: 16 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,39 +14,29 @@ Returns a tuple of indices:
1414
1515
# References
1616
> Lawrence Hubert and Phipps Arabie (1985). *Comparing partitions.*
17-
> Journal of Classification 2 (1): 193218
17+
> Journal of Classification 2 (1): 193-218
1818
1919
> Meila, Marina (2003). *Comparing Clusterings by the Variation of
20-
> Information.* Learning Theory and Kernel Machines: 173–187.
20+
> Information.* Learning Theory and Kernel Machines: 173-187.
21+
22+
> Steinley, Douglas (2004). *Properties of the Hubert-Arabie Adjusted
23+
> Rand Index.* Psychological Methods, Vol. 9, No. 3: 386-396
2124
"""
2225
function randindex(a, b)
    # Pair counts, Table 2 from Steinley 2004. A matrix destructures in
    # column-major order, which is why c21 is listed before c12.
    # The counts are widened before use: the products below are of order n^4
    # for n samples and would silently wrap around in Int64 arithmetic once
    # the number of pairs exceeds roughly 3e9 (about 78k samples).
    c11, c21, c12, c22 = widen.(confusion(a, b))

    t = c11 + c12 + c21 + c22   # total number of pairs of entities
    A = c11 + c22               # agreements count
    D = c12 + c21               # disagreements count

    # expected index
    ERI = (c11 + c12)*(c11 + c21) + (c21 + c22)*(c12 + c22)
    # adjusted Rand - Hubert & Arabie 1985, (9) from Steinley 2004.
    # Without disagreements ARI is defined as 1; this also sidesteps the 0/0
    # that the formula yields when all pairs fall into a single cell.
    ARI = D == 0 ? 1.0 : (t*A - ERI)/(t*t - ERI)

    RI = A/t            # Rand 1971      # Probability of agreement
    MI = D/t            # Mirkin 1970    # p(disagreement)
    HI = (A - D)/t      # Hubert 1977    # p(agree)-p(disagree)

    return (ARI, RI, MI, HI)
end

test/confusion.jl

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Test confusion matrix
2+
3+
using Test
4+
using Clustering
5+
6+
@testset "confusion() (Confusion matrix)" begin

    # Exhaustive check of three-sample labelings: each expected matrix is
    # [same/same same/diff; diff/same diff/diff] over the 3 sample pairs.
    @testset "small size tests" begin
        @test confusion([0,0,0], [0,0,0]) == [3 0; 0 0]
        @test confusion([0,0,1], [0,0,0]) == [1 0; 2 0]
        @test confusion([0,1,1], [0,0,0]) == [1 0; 2 0]
        @test confusion([1,1,1], [0,0,0]) == [3 0; 0 0]

        @test confusion([0,0,0], [0,0,1]) == [1 2; 0 0]
        @test confusion([0,0,1], [0,0,1]) == [1 0; 0 2]
        @test confusion([0,1,1], [0,0,1]) == [0 1; 1 1]
        @test confusion([1,1,1], [0,0,1]) == [1 2; 0 0]

        @test confusion([0,0,0], [0,1,1]) == [1 2; 0 0]
        @test confusion([0,0,1], [0,1,1]) == [0 1; 1 1]
        @test confusion([0,1,1], [0,1,1]) == [1 0; 0 2]
        @test confusion([1,1,1], [0,1,1]) == [1 2; 0 0]

        @test confusion([0,0,0], [1,1,1]) == [3 0; 0 0]
        @test confusion([0,0,1], [1,1,1]) == [1 0; 2 0]
        @test confusion([0,1,1], [1,1,1]) == [1 0; 2 0]
        @test confusion([1,1,1], [1,1,1]) == [3 0; 0 0]
    end

    @testset "comparing 2 k-means clusterings" begin
        m = 3
        n = 100
        k = 1
        x = rand(m, n)

        # non-weighted
        r1 = kmeans(x, k; maxiter=5)
        r2 = kmeans(x, k; maxiter=5)
        C = confusion(r1, r2)
        # With a single cluster all n(n-1)/2 pairs are grouped together in
        # both results. Integer division keeps the expected matrix integer-typed.
        @test C == [n*(n-1)÷2 0; 0 0]
    end

end
44+

test/randindex.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,9 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1]
3434

3535
@test randindex(a1, a2) == randindex(a2, a1)
3636

37+
@test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1)
38+
39+
a, b = rand(1:5, 10_000), rand(1:5, 10_000)
40+
@test randindex(a, b)[1] < 1.0e-2
41+
3742
end

test/runtests.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ tests = ["seeding",
1919
"hclust",
2020
"mcl",
2121
"vmeasure",
22-
"mutualinfo"]
22+
"mutualinfo",
23+
"confusion"]
2324

2425
println("Runing tests:")
2526
for t in tests

0 commit comments

Comments
 (0)