Skip to content

Commit 5bce68f

Browse files
committed
add an AVX-512-optimized single-buffer MD5 implementation
1 parent 7b65d53 commit 5bce68f

File tree

8 files changed

+2136
-7
lines changed

8 files changed

+2136
-7
lines changed

crypto/fipsmodule/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ if(ARCH STREQUAL "x86_64")
3434
ghash-ssse3-x86_64.${ASM_EXT}
3535
ghash-x86_64.${ASM_EXT}
3636
md5-x86_64.${ASM_EXT}
37+
md5-avx512.${ASM_EXT}
3738
p256-x86_64-asm.${ASM_EXT}
3839
p256_beeu-x86_64-asm.${ASM_EXT}
3940
rdrand-x86_64.${ASM_EXT}
@@ -144,6 +145,7 @@ if(PERL_EXECUTABLE)
144145
perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
145146
perlasm(md5-armv8.${ASM_EXT} md5/asm/md5-armv8.pl)
146147
perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl)
148+
perlasm(md5-avx512.${ASM_EXT} md5/asm/md5-avx512.pl)
147149
perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl)
148150
perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl)
149151
perlasm(p256-armv8-asm.${ASM_EXT} ec/asm/p256-armv8-asm.pl)
Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
#! /usr/bin/env perl
2+
# Copyright (C) 2025 Intel Corporation
3+
4+
# Command line: <flavour> <output file> [extra flags...]
# Both positional arguments are mandatory.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }

$flavour = shift;
$output  = shift;

# Win64 calling-convention handling is enabled for the nasm/masm/mingw64
# flavours, or whenever the output file name ends in ".asm".
$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# The AVX-512 body is generated unless one of the remaining arguments
# carries -DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX.
$avx512md5 = 1;
for (@ARGV) { $avx512md5 = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }

# Look for the x86_64-xlate.pl translator next to this script first, then in
# the shared perlasm directory three levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The open is checked so that a failure to start the translator is reported
# here instead of surfacing later as a confusing write or close error.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
22+
23+
#======================================================================
24+
25+
if ($avx512md5) {
26+
27+
# Stack bytes reserved in the Win64 prologue: five 16-byte XMM save slots.
my $XMM_STORAGE = 5 * 16;

# General-purpose registers carrying the three function arguments.
my ($state, $data, $num) = ("%rdi", "%rsi", "%rdx");

# Working MD5 state words a, b, c, d (%xmm1..%xmm4).
my ($a, $b, $c, $d) = map("%xmm$_", 1 .. 4);

# Copies of a, b, c, d taken at the start of each block (%xmm5..%xmm8).
my ($pa, $pb, $pc, $pd) = map("%xmm$_", 5 .. 8);
42+
43+
# Append the assembly for a single MD5 step to $code:
#
#   a = b + ((a + f(b, c, d) + X[off] + T[t]) <<< rot)
#
# Arguments:
#   $src   register holding the address of the current 64-byte block
#   $a-$d  XMM registers playing the roles a, b, c, d for this step
#   $off   index of the 32-bit message word within the block
#   $rot   left-rotate amount
#   $t     index into the .L_T constant table
#   $imm8  vpternlogd truth table (already "\$"-prefixed) selecting f
#
# %xmm9 and %xmm10 are used as scratch. The result is written to $a.
sub md5_step {
    my ($src, $a, $b, $c, $d, $off, $rot, $t, $imm8) = @_;

    # The message word is fetched with a 32-bit vmovd instead of being used
    # directly as the memory operand of vpaddd. A vpaddd memory operand on
    # XMM registers is 128 bits wide, so for word indices 13..15 it would
    # read 4..12 bytes beyond the end of the 64-byte block, and on the last
    # block that is beyond the end of the caller's buffer.
    #
    # TODO(pittma): At the cost of another register, we can add t and k
    # together, and then combine results which may get us better ILP.
    $code .= <<___;
vmovd .L_T+4*$t(%rip), %xmm10
vmovd $off*4($src), %xmm9 # k[i], 32-bit load only
vpaddd %xmm9, %xmm10, %xmm10 # T[i] + k[i]
vpaddd $a, %xmm10, %xmm10 # T[i] + k[i] + a
vmovdqa $b, %xmm9 # preserve b
vpternlogd $imm8, $d, $c, %xmm9 # f(b, c, d)
vpaddd %xmm9, %xmm10, %xmm9 # (T[i] + k[i]) + (f(b, c, d) + a)
vprold \$$rot, %xmm9, %xmm9 # tmp <<< s
vpaddd $b, %xmm9, $a # b + (tmp <<< s)
___
}
59+
60+
# Round 1 step. Takes ($src, $a, $b, $c, $d, $off, $rot, $t) and forwards
# them to md5_step with truth table 0xca. With md5_step's operand order
# (b in the destination, then c, then d) that table computes
# F(b, c, d) = (b & c) | (~b & d).
sub round1_op {
    return md5_step(@_[0 .. 7], "\$0xca");
}
65+
66+
# Round 2 step. Takes ($src, $a, $b, $c, $d, $off, $rot, $t) and forwards
# them to md5_step with truth table 0xe4. With md5_step's operand order
# (b in the destination, then c, then d) that table computes
# G(b, c, d) = (b & d) | (c & ~d).
sub round2_op {
    return md5_step(@_[0 .. 7], "\$0xe4");
}
71+
72+
# Round 3 step. Takes ($src, $a, $b, $c, $d, $off, $rot, $t) and forwards
# them to md5_step with truth table 0x96, the three-input XOR:
# H(b, c, d) = b ^ c ^ d.
sub round3_op {
    return md5_step(@_[0 .. 7], "\$0x96");
}
77+
78+
# Round 4 step. Takes ($src, $a, $b, $c, $d, $off, $rot, $t) and forwards
# them to md5_step with truth table 0x39. With md5_step's operand order
# (b in the destination, then c, then d) that table computes
# I(b, c, d) = c ^ (b | ~d).
sub round4_op {
    return md5_step(@_[0 .. 7], "\$0x39");
}
83+
84+
# Append to $code the assembly that processes one 64-byte block located at
# the address in $src: save the incoming state, emit the 64 MD5 steps
# (four rounds of sixteen), then add the saved state back in.
sub one_round {
    my ($src) = @_;

    # Keep a copy of the state as it was on entry to this block.
    $code .= <<___;
vmovdqa $a, $pa
vmovdqa $b, $pb
vmovdqa $c, $pc
vmovdqa $d, $pd
___

    # One entry per round:
    #   [ step emitter, first message-word index, index stride, rotations ]
    # Step i of a round uses message word (first + stride * i) mod 16 and the
    # rotation at position i mod 4. This reproduces the word order
    #   round 1:  0  1  2  3 ... 15
    #   round 2:  1  6 11  0  5 10 15  4  9 14  3  8 13  2  7 12
    #   round 3:  5  8 11 14  1  4  7 10 13  0  3  6  9 12 15  2
    #   round 4:  0  7 14  5 12  3 10  1  8 15  6 13  4 11  2  9
    my @schedule = (
        [ \&round1_op, 0, 1, [ 7, 12, 17, 22 ] ],
        [ \&round2_op, 1, 5, [ 5,  9, 14, 20 ] ],
        [ \&round3_op, 5, 3, [ 4, 11, 16, 23 ] ],
        [ \&round4_op, 0, 7, [ 6, 10, 15, 21 ] ],
    );

    # Registers in the roles (a, b, c, d). After every step the roles rotate:
    # ABCD -> DABC -> CDAB -> BCDA -> ABCD.
    my @roles = ($a, $b, $c, $d);

    # Running index into the .L_T table, 0..63 across all four rounds.
    my $step = 0;

    for my $entry (@schedule) {
        my ($emit, $first, $stride, $shifts) = @{$entry};
        for my $i (0 .. 15) {
            my $word = ($first + $stride * $i) % 16;
            $emit->($src, @roles, $word, $shifts->[$i % 4], $step);
            $step++;
            unshift @roles, pop @roles;
        }
    }

    # Fold the saved state back into the freshly computed one.
    $code .= <<___;
vpaddd $pa, $a, $a
vpaddd $pb, $b, $b
vpaddd $pc, $c, $c
vpaddd $pd, $d, $d
___
}
201+
202+
# void md5_x86_64_avx512(uint32_t *state,
#                        const uint8_t *data,
#                        size_t num);
205+
# Emit the md5_x86_64_avx512 function and its constant table.
#
# Arguments arrive in $state, $data and $num. The function loads the four
# state words, runs one_round once per 64-byte block while counting $num
# down to zero, and writes the four state words back.
$code .= <<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text

.globl md5_x86_64_avx512
.type md5_x86_64_avx512,\@function,3
.align 32
md5_x86_64_avx512:
.cfi_startproc
endbranch
___

# Win64 only: reserve an aligned stack area and spill %xmm6-%xmm10, which
# the body overwrites (%xmm6-%xmm8 as saved state, %xmm9-%xmm10 as scratch).
# The size must be written as an immediate ("\$"): without the prefix the
# interpolated number is an absolute memory operand in AT&T syntax.
if ($win64) {
    $code .= <<___;
push %rbp
mov %rsp,%rbp
sub \$$XMM_STORAGE, %rsp
and \$0xfffffffffffffff0,%rsp
vmovdqa %xmm6, 16*0(%rsp)
vmovdqa %xmm7, 16*1(%rsp)
vmovdqa %xmm8, 16*2(%rsp)
vmovdqa %xmm9, 16*3(%rsp)
vmovdqa %xmm10, 16*4(%rsp)
___
}

# Load the state. The loop below tests its counter only after processing a
# block, so a zero block count is sent straight to .L_done; the state that
# gets stored back is then the one just loaded.
$code .= <<___;
vmovd 4*0($state), $a
vmovd 4*1($state), $b
vmovd 4*2($state), $c
vmovd 4*3($state), $d

test $num, $num
jz .L_done

.align 32
.L_main_loop:
___

one_round($data);

# Advance to the next 64-byte block and loop until the count reaches zero.
$code .= <<___;
add \$64, $data
sub \$1, $num
jnz .L_main_loop

.L_done:
___

# Win64 only: restore the spilled registers and the original stack pointer.
if ($win64) {
    $code .= <<___;
vmovdqa 16*0(%rsp), %xmm6
vmovdqa 16*1(%rsp), %xmm7
vmovdqa 16*2(%rsp), %xmm8
vmovdqa 16*3(%rsp), %xmm9
vmovdqa 16*4(%rsp), %xmm10
mov %rbp,%rsp
pop %rbp
___
}

# Write the state back, then emit the 64-entry .L_T constant table that
# md5_step indexes with its $t argument.
$code .= <<___;
vmovd $a, 4*0($state)
vmovd $b, 4*1($state)
vmovd $c, 4*2($state)
vmovd $d, 4*3($state)
ret
.cfi_endproc
.size md5_x86_64_avx512, .-md5_x86_64_avx512

.section .rodata
.align 32
.L_T:
.long 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
.long 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
.long 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
.long 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
.long 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
.long 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
.long 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
.long 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
.long 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
.long 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
.long 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
.long 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
.long 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
.long 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
.long 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
.long 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
#endif
___
290+
291+
} else {
292+
# Fallback used when the AVX-512 body is disabled: the symbol is still
# defined, but its body is a ud2 (encoded as raw bytes) followed by ret.
$code = join("\n",
    ".text",
    ".globl md5_x86_64_avx512",
    "md5_x86_64_avx512:",
    ".byte 0x0f,0x0b # ud2",
    "ret",
    ".size md5_x86_64_avx512, .-md5_x86_64_avx512",
    "",    # trailing empty element gives the final newline
);
300+
}
301+
302+
# Send the generated assembly to STDOUT, which was aliased to the translator
# pipe earlier in the script. Closing the handle is checked and dies with the
# reason on failure.
print STDOUT $code;

close(STDOUT) or die "error closing STDOUT: $!";

crypto/fipsmodule/md5/internal.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,23 @@ OPENSSL_EXPORT int MD5_get_state(MD5_CTX *ctx,
4545
uint8_t out_h[MD5_CHAINING_LENGTH],
4646
uint64_t *out_n);
4747

48-
#if !defined(OPENSSL_NO_ASM) && \
49-
(defined(OPENSSL_X86_64) || defined(OPENSSL_X86) || defined(OPENSSL_AARCH64))
48+
#if !defined(OPENSSL_NO_ASM)
49+
// If building for x86_64 and we have a new enough assembler, we need both
50+
// definitions for the case where we've built for AVX-512, but it is not
51+
// available at runtime.
52+
#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
53+
#define MD5_ASM_AVX512
54+
extern void md5_x86_64_avx512(uint32_t *state, const uint8_t *data,
55+
size_t num);
56+
#endif
57+
58+
#if defined(OPENSSL_X86_64) || defined(OPENSSL_X86) || \
59+
defined(OPENSSL_AARCH64)
5060
#define MD5_ASM
5161
extern void md5_block_asm_data_order(uint32_t *state, const uint8_t *data,
5262
size_t num);
5363
#endif
54-
64+
#endif // !defined(OPENSSL_NO_ASM)
5565

5666
#if defined(__cplusplus)
5767
} // extern "C"

0 commit comments

Comments
 (0)