// +build gc,!purego,!noasm

#include "textflag.h"

// Register roles shared by every routine in this file.
#define digest	R1	// Pointer to the digest state.
#define h	R2	// Running hash value.
#define p	R3	// Read cursor into the input bytes.
#define len	R4	// Input length in bytes.
#define nblocks	R5	// Count of 32-byte blocks left to process (len / 32).
#define prime1	R7
#define prime2	R8
#define prime3	R9
#define prime4	R10
#define prime5	R11
#define v1	R12	// v1-v4: the four accumulator lanes.
#define v2	R13
#define v3	R14
#define v4	R15
#define x1	R20	// x1-x4: scratch registers for freshly loaded input words.
#define x2	R21
#define x3	R22
#define x4	R23

// round folds one 64-bit input word into an accumulator lane:
//
//	acc = rotl64(acc + word*prime2, 31) * prime1
//
// A rotate right by 64-31 is the same as a rotate left by 31.
#define round(acc, word) \
	MADD	prime2, acc, word, acc \
	ROR	$64-31, acc            \
	MUL	prime1, acc

// round0 is round with a zero accumulator, computed in place:
//
//	word = rotl64(word*prime2, 31) * prime1
#define round0(word) \
	MUL	prime2, word \
	ROR	$64-31, word \
	MUL	prime1, word

// mergeRound mixes one accumulator lane into h. The lane register is
// overwritten in the process:
//
//	h = (h ^ round0(lane))*prime1 + prime4
#define mergeRound(lane) \
	round0(lane)                  \
	EOR	lane, h               \
	MADD	h, prime4, prime1, h

// blocksLoop consumes len/32 blocks of 32 bytes each, updating v1-v4 and
// advancing p by 32 per block. It requires len >= 32: the block counter is
// only tested after the first block has been processed. It overwrites
// nblocks and x1-x4 and leaves len untouched.
//
// The second pair of words is read at -16(p) because the first load has
// already post-incremented p by the full 32 bytes.
#define blocksLoop() \
	LSR	$5, len, nblocks   \
	PCALIGN	$16                \
	nextBlock:                 \
	LDP.P	32(p), (x1, x2)    \
	round(v1, x1)              \
	LDP	-16(p), (x3, x4)   \
	round(v2, x2)              \
	SUB	$1, nblocks        \
	round(v3, x3)              \
	round(v4, x4)              \
	CBNZ	nblocks, nextBlock

// The primes are repeated here to ensure that they're stored
// in a contiguous array, so we can load them with LDP.
DATA primes<> +0(SB)/8, $11400714785074694791
DATA primes<> +8(SB)/8, $14029467366897019727
DATA primes<>+16(SB)/8, $1609587929392839161
DATA primes<>+24(SB)/8, $9650029242287828579
DATA primes<>+32(SB)/8, $2870177450012600261
GLOBL primes<>(SB), RODATA|NOPTR, $40

// func Sum64(b []byte) uint64
//
// Hashes b in a single pass and returns the 64-bit result.
//
// Inputs of 32 bytes or more first run through the four-lane block loop;
// the remaining len%32 bytes are then consumed by testing bits 4..0 of len
// (16, 8, 4, 2 and 1 bytes respectively), and the hash is finished with a
// shift/xor/multiply mixing sequence.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	LDP	b_base+0(FP), (p, len)

	LDP	primes<> +0(SB), (prime1, prime2)
	LDP	primes<>+16(SB), (prime3, prime4)
	MOVD	primes<>+32(SB), prime5

	CMP	$32, len
	CSEL	LO, prime5, ZR, h	// h = prime5 if len < 32, else 0
	BLO	tail

	// Seed the four lanes.
	ADD	prime1, prime2, v1	// v1 = prime1 + prime2
	MOVD	prime2, v2		// v2 = prime2
	MOVD	ZR, v3			// v3 = 0
	NEG	prime1, v4		// v4 = -prime1

	blocksLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	ROR	$64-1, v1, x1
	ROR	$64-7, v2, x2
	ADD	x1, x2
	ROR	$64-12, v3, x3
	ROR	$64-18, v4, x4
	ADD	x3, x4
	ADD	x2, x4, h

	mergeRound(v1)
	mergeRound(v2)
	mergeRound(v3)
	mergeRound(v4)

tail:
	ADD	len, h

	// 16 bytes: handled as two 8-byte words.
	TBZ	$4, len, tail8
	LDP.P	16(p), (x1, x2)

	// In every step below, h is rotated first and then XORed with the
	// equally rotated input. Rotation distributes over XOR, so the result
	// is the same as rotating (h ^ input).
	round0(x1)
	ROR	$64-27, h
	EOR	x1 @> 64-27, h, h
	MADD	h, prime4, prime1, h	// h = h*prime1 + prime4

	round0(x2)
	ROR	$64-27, h
	EOR	x2 @> 64-27, h, h
	MADD	h, prime4, prime1, h

tail8:
	// 8 bytes: one 64-bit word.
	TBZ	$3, len, tail4
	MOVD.P	8(p), x1

	round0(x1)
	ROR	$64-27, h
	EOR	x1 @> 64-27, h, h
	MADD	h, prime4, prime1, h

tail4:
	// 4 bytes: one zero-extended 32-bit word.
	TBZ	$2, len, tail2
	MOVWU.P	4(p), x2

	MUL	prime1, x2
	ROR	$64-23, h
	EOR	x2 @> 64-23, h, h
	MADD	h, prime3, prime2, h	// h = h*prime2 + prime3

tail2:
	// 2 bytes: loaded as one halfword, then mixed in a byte at a time,
	// low byte first.
	TBZ	$1, len, tail1
	MOVHU.P	2(p), x3
	AND	$255, x3, x1
	LSR	$8, x3, x2

	MUL	prime5, x1
	ROR	$64-11, h
	EOR	x1 @> 64-11, h, h
	MUL	prime1, h

	MUL	prime5, x2
	ROR	$64-11, h
	EOR	x2 @> 64-11, h, h
	MUL	prime1, h

tail1:
	// Final single byte.
	TBZ	$0, len, avalanche
	MOVBU	(p), x4

	MUL	prime5, x4
	ROR	$64-11, h
	EOR	x4 @> 64-11, h, h
	MUL	prime1, h

avalanche:
	// Final mixing of h.
	EOR	h >> 33, h
	MUL	prime2, h
	EOR	h >> 29, h
	MUL	prime3, h
	EOR	h >> 32, h

	MOVD	h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// Runs the block loop over b, updating the four lanes stored at the start
// of *d, and returns len(b) rounded down to a multiple of 32.
//
// Assumes len(b) >= 32.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// The block loop only needs prime1 and prime2.
	LDP	primes<>+0(SB), (prime1, prime2)

	// Load state. Assume v[1-4] are stored contiguously.
	MOVD	d+0(FP), digest
	LDP	0(digest), (v1, v2)
	LDP	16(digest), (v3, v4)

	LDP	b_base+8(FP), (p, len)

	blocksLoop()

	// Store updated state.
	STP	(v1, v2), 0(digest)
	STP	(v3, v4), 16(digest)

	// Clear the low five bits: len &^ 31.
	BIC	$31, len
	MOVD	len, ret+32(FP)
	RET