Fast Concordance Vector in R
1. The problem

We want, for each observation \(i\), the sum
\[
c_i = \sum_{j \ne i} \operatorname{sign}(x_j - x_i)\,\operatorname{sign}(y_j - y_i),
\]
where \(\operatorname{sign}\) is the usual sign function. This is essentially a per-observation contribution to Kendall's tau. Naively, this is \(\mathcal{O}(n^2)\), which is infeasible for large \(n\).
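For reference (this identity follows directly from the definition above rather than being stated in the post): summing the \(c_i\) counts every pair twice, so in the absence of ties Kendall's \(\tau\) can be recovered as
\[
\tau = \frac{C - D}{\binom{n}{2}} = \frac{\sum_{i=1}^{n} c_i}{n(n-1)},
\]
where \(C\) and \(D\) are the numbers of concordant and discordant pairs.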
2. Naive R solution

The naive R implementation uses outer to compute all pairwise differences:

concordance_naive <- function(x, y) {
  # signs of all pairwise differences in x and in y
  dx <- sign(outer(x, x, `-`))
  dy <- sign(outer(y, y, `-`))
  # row sums of the elementwise product give the concordance vector
  .rowSums(dx * dy, nrow(dx), ncol(dx))
}

This is simple, and for small \(n\) it is actually very fast because everything is vectorized. However, both dx and dy are \(n \times n\) matrices, so this is \(\mathcal{O}(n^2)\) in memory and time.
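As a quick sanity check, here is a toy input (made up for illustration, not taken from the post) whose result can be worked out by hand: observation 1 forms a concordant pair with both other points, while observations 2 and 3 each form one concordant and one discordant pair.

x_demo <- c(1, 2, 3)
y_demo <- c(1, 3, 2)
concordance_naive(x_demo, y_demo)
# expected output: 2 0 0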
2b. Naive Rcpp solution

A straightforward Rcpp solution is:

Rcpp::sourceCpp(code = r"{
#include <Rcpp.h>
using namespace Rcpp;

int sign(double x){
  return x == 0 ? 0 : x > 0 ? 1 : -1;
}

// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.5), p. 394
int Q(double a, double b, double c, double d) {
  return sign((d-b)*(c-a));
}

// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.37), p. 415
// [[Rcpp::export]]
IntegerVector concordanceVector_cpp(const NumericVector& x, const NumericVector& y){
  int n = x.size();
  IntegerVector out(n);

  for(int i = 0; i < n; i++){
    out[i] = 0;
    for(int t = 0; t < n; t++){
      if(i != t) out[i] += Q(x[i], y[i], x[t], y[t]);
    }
  }

  return out;
}
}")

The double loop is still \(\mathcal{O}(n^2)\) in time, but it avoids allocating the \(n \times n\) matrices.
3. Optimized solution (Fenwick Tree)

The key optimization is to avoid the \(n \times n\) matrix entirely. Steps:

- Sort points by x. Then \(\operatorname{sign}(x_j - x_i)\) is determined by index order, i.e., \(\operatorname{sign}(x_j - x_i) = \operatorname{sign}(j - i)\) (runs of equal x values are handled as groups in the implementation below).
- Replace y with integer ranks.
- Use a Fenwick Tree (Binary Indexed Tree) to efficiently count how many larger/smaller y values exist to the left and right of each index (a standalone sketch of the Fenwick primitives follows below).
- Combine these counts to get the concordance vector in \(\mathcal{O}(n \log n)\): in x-sorted order, points to the right of \(i\) contribute \(+1\) if their y is larger and \(-1\) if smaller, while points to the left contribute with the opposite signs, so \(c_i = (\text{greater}_{\text{right}} - \text{less}_{\text{right}}) + (\text{less}_{\text{left}} - \text{greater}_{\text{left}})\).

This is the same trick used in fast Kendall's tau algorithms, but adapted to return per-observation contributions.
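For readers who have not seen a Fenwick tree before, the implementation in the next section inlines its two standard primitives, point update and prefix sum, as bare while loops. The helpers below are an illustrative sketch only (the names bit_add and bit_sum are mine and appear in the implementation solely as comments), assuming y has already been compressed to ranks 1..m:

# Fenwick tree over ranks 1..m, stored as an integer vector `bit` of zeros

# record one occurrence of rank r: climb upwards by the lowest set bit
bit_add <- function(bit, r) {
  m <- length(bit)
  j <- r
  while (j <= m) {
    bit[j] <- bit[j] + 1L
    j <- j + bitwAnd(j, -j)
  }
  bit  # R copies on modify, so the updated tree must be returned and reassigned
}

# count how many recorded ranks are <= r: descend by the lowest set bit
bit_sum <- function(bit, r) {
  s <- 0L
  j <- r
  while (j > 0L) {
    s <- s + bit[j]
    j <- j - bitwAnd(j, -j)
  }
  s
}

Both operations touch at most about \(\log_2 m\) entries of bit, which is where the \(\mathcal{O}(n \log n)\) total comes from. The in-place version below writes these loops out directly (avoiding function-call and copy-on-modify overhead) and precomputes the bitwAnd(j, -j) values in a lookup table.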
4. Implementation of the optimized solution
# in-place Fenwick (handles ties in x and y; minimal allocations)
# the code is a bit messy because R does not allow mutation
concordance_fenwick_inplace <- function(x, y) {
  n <- length(x)
  stopifnot(length(y) == n)
  if (n == 0L) return(integer(0L))
  if (n == 1L) return(0L)

  # 1) order by x (stable), keep x-ordered copies
  ord <- order(x, y, method = "radix")
  xo <- x[ord]
  yo <- y[ord]

  # 2) compute group runs for equal x (use rle to avoid building big lists)
  if (anyDuplicated(xo)) {
    rle_x <- rle(xo)
    group_lengths <- rle_x$lengths
    group_starts <- cumsum(c(1L, head(group_lengths, -1L)))
    group_ends <- cumsum(group_lengths)
    G <- length(group_lengths) # number of groups
  } else {
    # fast path
    rle_x <- rle(xo)
    group_lengths <- rep.int(1L, n)
    group_starts <- 1:n
    group_ends <- 1:n
    G <- n # number of groups
  }

  # 3) compress y to ranks 1..m (strict ordering of unique y's)
  uniq_y <- sort(unique(yo))
  ry <- match(yo, uniq_y) # integer vector 1..m
  m <- length(uniq_y)

  # allocate bit and result vectors (integers)
  bit <- integer(m) # 1-indexed BIT (positions 1..m)
  less_right <- integer(n)
  greater_right <- integer(n)
  # originally there were separate vectors, but we can reuse some memory
  # less_left <- integer(n)
  # greater_left <- integer(n)

  # precompute the results of bitwAnd so we can just do lookup
  # lowbits <- (1:m) - ((1:m) & ((1:m) - 1L))
  # lowbits <- (1:m) - bitwAnd((1:m), ((1:m) - 1L))
  # lowbits <- vapply(1:m, \(i) bitwAnd(i, -i), 0L)
  lowbits <- bitwAnd(1:m, -1:-m)

  # Helper: bit_sum and bit_add are implemented inline as loops below
  # -------- Right sweep (groups processed from right to left) ----------
  total_seen <- 0L
  for (g in G:1L) {
    i1 <- group_starts[g]
    i2 <- group_ends[g]
    # query each index in this group (do NOT add them yet)
    for (i in i1:i2) {
      r <- ry[i]
      # less_right: bit_sum(r-1)
      j <- r - 1L
      s_less <- 0L
      while (j > 0L) {
        s_less <- s_less + bit[j]
        j <- j - lowbits[j] # bitwAnd(j, -j)
      }
      # leq_count: bit_sum(r)
      j <- r
      s_leq <- 0L
      while (j > 0L) {
        s_leq <- s_leq + bit[j]
        j <- j - lowbits[j] # bitwAnd(j, -j)
      }
      less_right[i] <- s_less
      greater_right[i] <- total_seen - s_leq
    }
    # now add this entire group into the BIT (so earlier groups see them)
    for (i in i1:i2) {
      r <- ry[i]
      j <- r
      while (j <= m) {
        bit[j] <- bit[j] + 1L
        j <- j + lowbits[j] # bitwAnd(j, -j)
      }
      total_seen <- total_seen + 1L
    }
  }
  out_xorder <- (greater_right - less_right) # partial result
  # reset for left sweep, these are now less_left and greater_left
  less_right[] <- 0L
  greater_right[] <- 0L

  # -------- Left sweep (groups processed from left to right) ----------
  bit[] <- 0L # reset BIT in-place (no new allocation)
  total_seen <- 0L
  for (g in 1L:G) {
    i1 <- group_starts[g]
    i2 <- group_ends[g]
    # query each index in this group (do NOT add them yet)
    for (i in i1:i2) {
      r <- ry[i]
      # less_left: bit_sum(r-1)
      j <- r - 1L
      s_less <- 0L
      while (j > 0L) {
        s_less <- s_less + bit[j]
        j <- j - lowbits[j] # bitwAnd(j, -j)
      }
      # leq_count: bit_sum(r)
      j <- r
      s_leq <- 0L
      while (j > 0L) {
        s_leq <- s_leq + bit[j]
        j <- j - lowbits[j] # bitwAnd(j, -j)
      }
      # less_left[i] <- s_less
      # greater_left[i] <- total_seen - s_leq
      less_right[i] <- s_less
      greater_right[i] <- total_seen - s_leq
    }
    # now add this group's ranks to BIT
    for (i in i1:i2) {
      r <- ry[i]
      j <- r
      while (j <= m) {
        bit[j] <- bit[j] + 1L
        j <- j + lowbits[j] # bitwAnd(j, -j)
      }
      total_seen <- total_seen + 1L
    }
  }

  out_xorder <- out_xorder + (less_right - greater_right)
  # reuse some memory
  less_right[] <- 0L
  less_right[ord] <- out_xorder
  return(less_right)

  # combine contributions (in x-sorted order), then restore original order
  # out_xorder <- (greater_right - less_right) + (less_left - greater_left)
  # out <- integer(n)
  # out[ord] <- out_xorder
  # out
}
5. Benchmarking

We compare the naive and optimized versions using the bench package.
# results with ties
x <- sample(-5:5, 100, TRUE)
y <- sample(-5:5, 100, TRUE)
naive <- concordance_naive(x, y)
naive_cpp <- concordanceVector_cpp(x, y)
fenwick2 <- concordance_fenwick_inplace(x, y)
all(naive == fenwick2) && all(naive_cpp == fenwick2)
[1] TRUE

# results without ties
x <- rnorm(10)
y <- rnorm(10)
naive <- concordance_naive(x, y)
naive_cpp <- concordanceVector_cpp(x, y)
fenwick2 <- concordance_fenwick_inplace(x, y)
all(naive == fenwick2) && all(naive_cpp == fenwick2)
[1] TRUE

# profvis::profvis({
#   for (i in 1:10000)
#     concordance_fenwick_inplace(x, y)
# })
orders <- 3
ns <- c(sapply(seq_len(orders), \(o) {
  from <- 10^o
  to <- 10^(o+1) - from
  seq(from, to, by = from) # e.g., 10, 100 - 10, 10
}), 10^(orders+1))
res <- suppressWarnings(bench::press(
  n = ns,
  {
    x <- rnorm(n)
    y <- rnorm(n)
    bench::mark(
      naive = concordance_naive(x, y),
      naive_cpp = concordanceVector_cpp(x, y),
      fenwick = concordance_fenwick_inplace(x, y),
      check = TRUE, # ensure results are checked for equality
      iterations = 3
    )
  },
  .quiet = !interactive()
  # suppressWarnings to hide the message "Some expressions had a GC in every iteration; so filtering is disabled."
))
res
# A tibble: 84 × 7
   expression     n      min   median `itr/sec` mem_alloc `gc/sec`
   <bch:expr> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
 1 naive         10  15.75µs  17.71µs    50173.     5.8KB        0
 2 naive_cpp     10   1.89µs   2.47µs   328722.        0B        0
 3 fenwick       10  125.5µs 145.73µs     6575.        0B        0
 4 naive         20  21.59µs  24.81µs    39850.   22.41KB        0
 5 naive_cpp     20   2.77µs   4.43µs   218560.        0B        0
 6 fenwick       20  138.2µs 158.46µs     6469.     2.7KB        0
 7 naive         30   38.7µs  40.75µs    23444.   49.83KB        0
 8 naive_cpp     30   5.39µs   5.69µs   145643.        0B        0
 9 fenwick       30 165.59µs 178.94µs     5586.    3.41KB        0
10 naive         40  75.67µs   76.8µs    12557.   88.59KB        0
# ℹ 74 more rows
6. Plot of runtime
# assumes dplyr, ggplot2, and patchwork are attached (library() calls not shown here)
res$algorithm <- as.character(res$expression)
idx_bad <- res$mem_alloc == 0
res$mem_alloc[idx_bad] <- min(res$mem_alloc[!idx_bad]) / 100 # to avoid log10(0) in the fits below
lm_res <- res |>
  group_by(algorithm) |>
  summarize(
    # Fit the model once and extract coefficients
    time_fit = list(lm(log10(min) ~ log10(n))),
    mem_fit = list(lm(log10(mem_alloc) ~ log10(n))),
    .groups = "drop"
  ) |>
  mutate(
    time_intercept = unname(sapply(time_fit, \(x) coef(x)["(Intercept)"])),
    time_slope = unname(sapply(time_fit, \(x) coef(x)["log10(n)"])),
    mem_intercept = unname(sapply(mem_fit, \(x) coef(x)["(Intercept)"])),
    mem_slope = unname(sapply(mem_fit, \(x) coef(x)["log10(n)"])),
    # Remove the temporary columns if you don't need them
    .keep = "unused"
  )

# where the fitted log-log line for fenwick intersects the lines of the other algorithms
idx <- match(c("fenwick", "naive", "naive_cpp"), lm_res$algorithm)
i1 <- which(lm_res$algorithm == "fenwick")
i2 <- seq_along(lm_res$algorithm)[-i1]
intersection_n_time <- 10^(with(lm_res, (time_intercept[i1] - time_intercept[i2]) / (time_slope[i2] - time_slope[i1])))
intersection_n_mem <- 10^(with(lm_res, (mem_intercept[i1] - mem_intercept[i2]) / (mem_slope[i2] - mem_slope[i1])))
intersection_y_time <- 10^(with(lm_res, (lm_res$time_intercept[i1] + lm_res$time_slope[i1] * log10(intersection_n_time))))
intersection_y_mem <- 10^(with(lm_res, (lm_res$mem_intercept[i1] + lm_res$mem_slope[i1] * log10(intersection_n_mem))))

intersection_n_time_rounded <- round(intersection_n_time)

# df_intersect <- data.frame(
#   n_time = c(intersection_n_time, intersection_n_time),
#   n_mem = c(intersection_n_mem, intersection_n_mem),
#   y_time = c(min(res$min), as_bench_time(intersection_y_time)),
#   y_mem = c(min(res$mem_alloc), as_bench_time(intersection_y_mem))
# )

idx <- match(c("fenwick", "naive"), lm_res$algorithm)
ref_lines <- data.frame(
  order = c("O(n)", "O(n^2)"),
  slope = c(1, 2), # Reference slopes for O(n) and O(n^2)
  time_intercept = lm_res$time_intercept[idx],
  mem_intercept = lm_res$mem_intercept[idx]
)

plt_time <- ggplot(res, aes(x = n, y = min, group = algorithm, color = algorithm)) +
  # scale_y_log10() +
  geom_point() +
  # geom_line(alpha = .4, linetype = "dashed") +
  scale_x_log10() +
  # geom_abline(data = lm_res, mapping = aes(slope = time_slope, intercept = time_intercept, color = algorithm), linetype = "dashed") +
  geom_abline(data = ref_lines, mapping = aes(slope = slope, intercept = time_intercept, color = order), linetype = "dotted") +
  # geom_line(data = df_intersect, mapping = aes(x = n_time, y = y_time), alpha = .5, color = "grey", inherit.aes = FALSE) +
  labs(y = "Minimum runtime") +
  ggtitle("Time")

plt_memory <- ggplot(res, aes(x = n, y = mem_alloc, group = algorithm, color = algorithm)) +
  # scale_y_log10() +
  geom_point() +
  # geom_line(alpha = .4, linetype = "dashed") +
  scale_x_log10() +
  # geom_abline(data = lm_res, mapping = aes(slope = mem_slope, intercept = mem_intercept, color = algorithm), linetype = "dashed") +
  geom_abline(data = ref_lines, mapping = aes(slope = slope, intercept = mem_intercept, color = order), linetype = "dotted") +
  # not meaningful, intersection is before n = 10
  # geom_line(data = df_intersect, mapping = aes(x = n_mem, y = y_mem), alpha = .5, color = "grey", inherit.aes = FALSE) +
  labs(y = "Allocated Memory") +
  ggtitle("Memory")

# plotly::subplot(plotly::ggplotly(plt_time), plotly::ggplotly(plt_memory), nrows = 1)
plot(plt_time + plt_memory + plot_layout(guides = "collect"))
7. Conclusions

- On my machine, the Fenwick approach outperforms the naive approach around \(n =\) 138 and the naive C++ approach around \(n =\) 608 (see the note below on how these crossover points are estimated). Theoretically, the naive approach is \(\mathcal{O}(n^2)\) in time, while the Fenwick approach is \(\mathcal{O}(n \log n)\).
- The naive approach always uses more memory than the Fenwick approach, which uses more memory than the C++ approach. Theoretically, the naive approach is \(\mathcal{O}(n^2)\) in memory, while the Fenwick and C++ approaches are \(\mathcal{O}(n)\).
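A short note on how those crossover points are estimated (the algebra is spelled out here for convenience; it matches the intersection_n_time computation in section 6): on log-log axes, a runtime of the form \(t(n) \approx C\,n^k\) becomes a straight line,
\[
\log_{10} t(n) \approx a + b \log_{10} n, \qquad a = \log_{10} C,\; b = k,
\]
so the fitted slope \(b\) estimates the exponent (which is why the reference lines use slopes 1 and 2). Two fitted lines with coefficients \((a_1, b_1)\) and \((a_2, b_2)\) cross where \(a_1 + b_1 u = a_2 + b_2 u\), i.e.
\[
u = \frac{a_1 - a_2}{b_2 - b_1}, \qquad n^{\ast} = 10^{u},
\]
which is exactly the expression evaluated for intersection_n_time (and intersection_n_mem) above.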