|
#! /usr/bin/env perl
# Copyright (C) 2025 Intel Corporation

# Usage: <script> <flavour> <output-path> [assembler flags...]
# Generates AVX-512 MD5 assembly, piped through the perlasm x86_64
# translator so one source serves gas/nasm/masm/mingw targets.

if ($#ARGV < 1) { die "Not enough arguments provided.
 Two arguments are necessary: the flavour and the output file path."; }

$flavour = shift;
$output  = shift;

# Windows targets must preserve the non-volatile xmm6+ registers.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# The build system passes this define when the assembler is too old to
# understand AVX-512 mnemonics; fall back to a stub in that case.
$avx512md5 = 1;
for (@ARGV) { $avx512md5 = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }

# Locate the perlasm translator either next to this script or in the
# usual perlasm directory of the source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator; fail loudly if the
# pipe cannot be created instead of silently writing to a dead handle.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
#======================================================================

if ($avx512md5) {

    # Stack bytes needed on win64 to save the non-volatile xmm6..xmm10
    # registers this code clobbers (5 registers x 16 bytes).
    my $XMM_STORAGE = 16 * 5;

    # SysV argument registers: MD5 state pointer, input data pointer,
    # and the number of 64-byte blocks to process.
    my $state = "%rdi";
    my $data  = "%rsi";
    my $num   = "%rdx";

    # Working MD5 state held in vector registers (scalar in lane 0).
    my $a = "%xmm1";
    my $b = "%xmm2";
    my $c = "%xmm3";
    my $d = "%xmm4";

    # Snapshot of the state at block entry, added back after the 64
    # steps (the Davies-Meyer feed-forward).
    my $pa = "%xmm5";
    my $pb = "%xmm6";
    my $pc = "%xmm7";
    my $pd = "%xmm8";
| 42 | + |
| 43 | + sub md5_step { |
| 44 | + my ($src, $a, $b, $c, $d, $off, $rot, $t, $imm8) = @_; |
| 45 | + |
| 46 | + # TODO(pittma): At the cost of another register, we can add t and k |
| 47 | + # together, and then combine results which may get us better ILP. |
| 48 | + $code .= <<___; |
| 49 | + vmovd .L_T+4*$t(%rip), %xmm10 |
| 50 | + vpaddd $off*4($src), %xmm10, %xmm10 # T[i] + k[i] |
| 51 | + vpaddd $a, %xmm10, %xmm10 # T[i] + k[i] + a |
| 52 | + vmovdqa $b, %xmm9 # preserve b |
| 53 | + vpternlogd $imm8, $d, $c, %xmm9 # f(b, c, d) |
| 54 | + vpaddd %xmm9, %xmm10, %xmm9 # (T[i] + k[i]) + (f(b, c, d) + a) |
| 55 | + vprold \$$rot, %xmm9, %xmm9 # tmp <<< s |
| 56 | + vpaddd $b, %xmm9, $a # b + (tmp <<< s) |
| 57 | +___ |
| 58 | + } |
| 59 | + |
| 60 | + sub round1_op { |
| 61 | + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; |
| 62 | + |
| 63 | + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0xca"); |
| 64 | + } |
| 65 | + |
| 66 | + sub round2_op { |
| 67 | + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; |
| 68 | + |
| 69 | + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0xe4"); |
| 70 | + } |
| 71 | + |
| 72 | + sub round3_op { |
| 73 | + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; |
| 74 | + |
| 75 | + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0x96"); |
| 76 | + } |
| 77 | + |
| 78 | + sub round4_op { |
| 79 | + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; |
| 80 | + |
| 81 | + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0x39"); |
| 82 | + } |
| 83 | + |
| 84 | + sub one_round { |
| 85 | + my ($src) = @_; |
| 86 | + |
| 87 | + $code .= <<___; |
| 88 | + vmovdqa $a, $pa |
| 89 | + vmovdqa $b, $pb |
| 90 | + vmovdqa $c, $pc |
| 91 | + vmovdqa $d, $pd |
| 92 | +___ |
| 93 | + |
| 94 | + # Round 1 |
| 95 | + # [ABCD 0 7 1] [DABC 1 12 2] [CDAB 2 17 3] [BCDA 3 22 4] |
| 96 | + round1_op($src, $a, $b, $c, $d, 0, 7, 0); |
| 97 | + round1_op($src, $d, $a, $b, $c, 1, 12, 1); |
| 98 | + round1_op($src, $c, $d, $a, $b, 2, 17, 2); |
| 99 | + round1_op($src, $b, $c, $d, $a, 3, 22, 3); |
| 100 | + |
| 101 | + # [ABCD 4 7 5] [DABC 5 12 6] [CDAB 6 17 7] [BCDA 7 22 8] |
| 102 | + round1_op($src, $a, $b, $c, $d, 4, 7, 4); |
| 103 | + round1_op($src, $d, $a, $b, $c, 5, 12, 5); |
| 104 | + round1_op($src, $c, $d, $a, $b, 6, 17, 6); |
| 105 | + round1_op($src, $b, $c, $d, $a, 7, 22, 7); |
| 106 | + |
| 107 | + # [ABCD 8 7 9] [DABC 9 12 10] [CDAB 10 17 11] [BCDA 11 22 12] |
| 108 | + round1_op($src, $a, $b, $c, $d, 8, 7, 8); |
| 109 | + round1_op($src, $d, $a, $b, $c, 9, 12, 9); |
| 110 | + round1_op($src, $c, $d, $a, $b, 10, 17, 10); |
| 111 | + round1_op($src, $b, $c, $d, $a, 11, 22, 11); |
| 112 | + |
| 113 | + # [ABCD 12 7 13] [DABC 13 12 14] [CDAB 14 17 15] [BCDA 15 22 16] |
| 114 | + round1_op($src, $a, $b, $c, $d, 12, 7, 12); |
| 115 | + round1_op($src, $d, $a, $b, $c, 13, 12, 13); |
| 116 | + round1_op($src, $c, $d, $a, $b, 14, 17, 14); |
| 117 | + round1_op($src, $b, $c, $d, $a, 15, 22, 15); |
| 118 | + |
| 119 | + # Round 2 |
| 120 | + # [ABCD 1 5 17] [DABC 6 9 18] [CDAB 11 14 19] [BCDA 0 20 20] |
| 121 | + round2_op($src, $a, $b, $c, $d, 1, 5, 16); |
| 122 | + round2_op($src, $d, $a, $b, $c, 6, 9, 17); |
| 123 | + round2_op($src, $c, $d, $a, $b, 11, 14, 18); |
| 124 | + round2_op($src, $b, $c, $d, $a, 0, 20, 19); |
| 125 | + |
| 126 | + # [ABCD 5 5 21] [DABC 10 9 22] [CDAB 15 14 23] [BCDA 4 20 24] |
| 127 | + round2_op($src, $a, $b, $c, $d, 5, 5, 20); |
| 128 | + round2_op($src, $d, $a, $b, $c, 10, 9, 21); |
| 129 | + round2_op($src, $c, $d, $a, $b, 15, 14, 22); |
| 130 | + round2_op($src, $b, $c, $d, $a, 4, 20, 23); |
| 131 | + |
| 132 | + # [ABCD 9 5 25] [DABC 14 9 26] [CDAB 3 14 27] [BCDA 8 20 28] |
| 133 | + round2_op($src, $a, $b, $c, $d, 9, 5, 24); |
| 134 | + round2_op($src, $d, $a, $b, $c, 14, 9, 25); |
| 135 | + round2_op($src, $c, $d, $a, $b, 3, 14, 26); |
| 136 | + round2_op($src, $b, $c, $d, $a, 8, 20, 27); |
| 137 | + |
| 138 | + # [ABCD 13 5 29] [DABC 2 9 30] [CDAB 7 14 31] [BCDA 12 20 32] |
| 139 | + round2_op($src, $a, $b, $c, $d, 13, 5, 28); |
| 140 | + round2_op($src, $d, $a, $b, $c, 2, 9, 29); |
| 141 | + round2_op($src, $c, $d, $a, $b, 7, 14, 30); |
| 142 | + round2_op($src, $b, $c, $d, $a, 12, 20, 31); |
| 143 | + |
| 144 | + # Round 3 |
| 145 | + # [ABCD 5 4 33] [DABC 8 11 34] [CDAB 11 16 35] [BCDA 14 23 36] |
| 146 | + round3_op($src, $a, $b, $c, $d, 5, 4, 32); |
| 147 | + round3_op($src, $d, $a, $b, $c, 8, 11, 33); |
| 148 | + round3_op($src, $c, $d, $a, $b, 11, 16, 34); |
| 149 | + round3_op($src, $b, $c, $d, $a, 14, 23, 35); |
| 150 | + |
| 151 | + # [ABCD 1 4 37] [DABC 4 11 38] [CDAB 7 16 39] [BCDA 10 23 40] |
| 152 | + round3_op($src, $a, $b, $c, $d, 1, 4, 36); |
| 153 | + round3_op($src, $d, $a, $b, $c, 4, 11, 37); |
| 154 | + round3_op($src, $c, $d, $a, $b, 7, 16, 38); |
| 155 | + round3_op($src, $b, $c, $d, $a, 10, 23, 39); |
| 156 | + |
| 157 | + # [ABCD 13 4 41] [DABC 0 11 42] [CDAB 3 16 43] [BCDA 6 23 44] |
| 158 | + round3_op($src, $a, $b, $c, $d, 13, 4, 40); |
| 159 | + round3_op($src, $d, $a, $b, $c, 0, 11, 41); |
| 160 | + round3_op($src, $c, $d, $a, $b, 3, 16, 42); |
| 161 | + round3_op($src, $b, $c, $d, $a, 6, 23, 43); |
| 162 | + |
| 163 | + # [ABCD 9 4 45] [DABC 12 11 46] [CDAB 15 16 47] [BCDA 2 23 48] |
| 164 | + round3_op($src, $a, $b, $c, $d, 9, 4, 44); |
| 165 | + round3_op($src, $d, $a, $b, $c, 12, 11, 45); |
| 166 | + round3_op($src, $c, $d, $a, $b, 15, 16, 46); |
| 167 | + round3_op($src, $b, $c, $d, $a, 2, 23, 47); |
| 168 | + |
| 169 | + # Round 4 |
| 170 | + # [ABCD 0 6 49] [DABC 7 10 50] [CDAB 14 15 51] [BCDA 5 21 52] |
| 171 | + round4_op($src, $a, $b, $c, $d, 0, 6, 48); |
| 172 | + round4_op($src, $d, $a, $b, $c, 7, 10, 49); |
| 173 | + round4_op($src, $c, $d, $a, $b, 14, 15, 50); |
| 174 | + round4_op($src, $b, $c, $d, $a, 5, 21, 51); |
| 175 | + |
| 176 | + # [ABCD 12 6 53] [DABC 3 10 54] [CDAB 10 15 55] [BCDA 1 21 56] |
| 177 | + round4_op($src, $a, $b, $c, $d, 12, 6, 52); |
| 178 | + round4_op($src, $d, $a, $b, $c, 3, 10, 53); |
| 179 | + round4_op($src, $c, $d, $a, $b, 10, 15, 54); |
| 180 | + round4_op($src, $b, $c, $d, $a, 1, 21, 55); |
| 181 | + |
| 182 | + # [ABCD 8 6 57] [DABC 15 10 58] [CDAB 6 15 59] [BCDA 13 21 60] |
| 183 | + round4_op($src, $a, $b, $c, $d, 8, 6, 56); |
| 184 | + round4_op($src, $d, $a, $b, $c, 15, 10, 57); |
| 185 | + round4_op($src, $c, $d, $a, $b, 6, 15, 58); |
| 186 | + round4_op($src, $b, $c, $d, $a, 13, 21, 59); |
| 187 | + |
| 188 | + # [ABCD 4 6 61] [DABC 11 10 62] [CDAB 2 15 63] [BCDA 9 21 64] |
| 189 | + round4_op($src, $a, $b, $c, $d, 4, 6, 60); |
| 190 | + round4_op($src, $d, $a, $b, $c, 11, 10, 61); |
| 191 | + round4_op($src, $c, $d, $a, $b, 2, 15, 62); |
| 192 | + round4_op($src, $b, $c, $d, $a, 9, 21, 63); |
| 193 | + |
| 194 | + $code .= <<___; |
| 195 | + vpaddd $pa, $a, $a |
| 196 | + vpaddd $pb, $b, $b |
| 197 | + vpaddd $pc, $c, $c |
| 198 | + vpaddd $pd, $d, $d |
| 199 | +___ |
| 200 | + } |
| 201 | + |
| 202 | + # int md5_x86_64_avx512(const uint8_t *data, |
| 203 | + # size_t len, |
| 204 | + # uint8_t out[MD5_DIGEST_LENGTH]); |
| 205 | + $code .= <<___; |
| 206 | +#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX |
| 207 | + .text |
| 208 | +
|
| 209 | + .globl md5_x86_64_avx512 |
| 210 | + .type md5_x86_64_avx512,\@function,3 |
| 211 | + .align 32 |
| 212 | + md5_x86_64_avx512: |
| 213 | + .cfi_startproc |
| 214 | + endbranch |
| 215 | +___ |
| 216 | + if ($win64) { |
| 217 | + $code .= <<___; |
| 218 | + push %rbp |
| 219 | + mov %rsp,%rbp |
| 220 | + sub $XMM_STORAGE, %rsp |
| 221 | + and \$0xfffffffffffffff0,%rsp |
| 222 | + vmovdqa %xmm6, 16*0(%rsp) |
| 223 | + vmovdqa %xmm7, 16*1(%rsp) |
| 224 | + vmovdqa %xmm8, 16*2(%rsp) |
| 225 | + vmovdqa %xmm9, 16*3(%rsp) |
| 226 | + vmovdqa %xmm10, 16*4(%rsp) |
| 227 | +___ |
| 228 | + } |
| 229 | + $code .= <<___; |
| 230 | + vmovd 4*0($state), $a |
| 231 | + vmovd 4*1($state), $b |
| 232 | + vmovd 4*2($state), $c |
| 233 | + vmovd 4*3($state), $d |
| 234 | +
|
| 235 | + .align 32 |
| 236 | + .L_main_loop: |
| 237 | +___ |
| 238 | + |
| 239 | + one_round($data); |
| 240 | + |
| 241 | + $code .= <<___; |
| 242 | + add \$64, $data |
| 243 | + sub \$1, $num |
| 244 | + jnz .L_main_loop |
| 245 | +
|
| 246 | + .L_done: |
| 247 | +___ |
| 248 | + if ($win64) { |
| 249 | + $code .= <<___; |
| 250 | + vmovdqa 16*0(%rsp), %xmm6 |
| 251 | + vmovdqa 16*1(%rsp), %xmm7 |
| 252 | + vmovdqa 16*2(%rsp), %xmm8 |
| 253 | + vmovdqa 16*3(%rsp), %xmm9 |
| 254 | + vmovdqa 16*4(%rsp), %xmm10 |
| 255 | + mov %rbp,%rsp |
| 256 | + pop %rbp |
| 257 | +___ |
| 258 | + } |
| 259 | + |
| 260 | + $code .= <<___; |
| 261 | + vmovd $a, 4*0($state) |
| 262 | + vmovd $b, 4*1($state) |
| 263 | + vmovd $c, 4*2($state) |
| 264 | + vmovd $d, 4*3($state) |
| 265 | + ret |
| 266 | + .cfi_endproc |
| 267 | + .size md5_x86_64_avx512, .-md5_x86_64_avx512 |
| 268 | +
|
| 269 | + .section .rodata |
| 270 | + .align 32 |
| 271 | + .L_T: |
| 272 | + .long 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee |
| 273 | + .long 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 |
| 274 | + .long 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be |
| 275 | + .long 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 |
| 276 | + .long 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa |
| 277 | + .long 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 |
| 278 | + .long 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed |
| 279 | + .long 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a |
| 280 | + .long 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c |
| 281 | + .long 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 |
| 282 | + .long 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 |
| 283 | + .long 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 |
| 284 | + .long 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 |
| 285 | + .long 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 |
| 286 | + .long 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 |
| 287 | + .long 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 |
| 288 | +#endif |
| 289 | +___ |
| 290 | + |
} else {
    # Assembler too old for AVX-512: emit a stub carrying the same
    # symbol so links succeed; it traps (ud2) if ever reached.  The
    # runtime CPU dispatch must never select this path.
    $code = <<___;
.text
.globl md5_x86_64_avx512
md5_x86_64_avx512:
	.byte 0x0f,0x0b # ud2
	ret
.size md5_x86_64_avx512, .-md5_x86_64_avx512
___
}
| 301 | + |
# Emit the generated assembly into the xlate pipe.  Checking close is
# essential here: the translator's exit status and any buffered-write
# failure only surface when the pipe is closed.
print $code;

close STDOUT or die "error closing STDOUT: $!";