md5: minor optimization in software backend (#755)

newpavlov · web-flow · commit 6aa90e800eb3 · 2025-11-06T07:10:40.000+03:00
Replaces the logical OR in the G function with addition. This seemingly
results in better ALU utilization and improves performance by several
percent: from 699 MB/s to 753 MB/s on my x86 PC and from 910 MB/s to
960 MB/s on a Mac M4.
diff --git a/md5/src/compress/soft.rs b/md5/src/compress/soft.rs
@@ -12,7 +12,11 @@ fn op_f(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
 }
 #[inline(always)]
 fn op_g(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
-    ((x & z) | (y & !z))
+    // We replace the logical OR in `(x & z) | (y & !z)` with addition.
+    // The two masked terms never have a set bit in the same position, so the
+    // addition cannot carry and both forms are equivalent; however, addition
+    // performs better on high-end CPUs, likely due to improved ALU utilization.
+    ((x & z).wrapping_add(y & !z))
         .wrapping_add(w)
         .wrapping_add(m)
         .wrapping_add(c)