diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c
index 886bcf25e2..304cd0452d 100644
--- a/block-sha1/sha1.c
+++ b/block-sha1/sha1.c
@@ -82,6 +82,7 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx)
 #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; })
 #define SHA_ROL(x,n)	SHA_ASM("rol", x, n)
 #define SHA_ROR(x,n)	SHA_ASM("ror", x, n)
+#define SMALL_REGISTER_SET
 
 #else
 
@@ -93,7 +94,29 @@ void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx)
 
 /* This "rolls" over the 512-bit array */
 #define W(x) (array[(x)&15])
-#define setW(x, val) (*(volatile unsigned int *)&W(x) = (val))
+
+/*
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ *
+ * Ben Herrenschmidt reports that on PPC, the C version comes close
+ * to the optimized asm with this (ie on PPC you don't want that
+ * 'volatile', since there are lots of registers).
+ */
+#ifdef SMALL_REGISTER_SET
+  #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val))
+#else
+  #define setW(x, val) (W(x) = (val))
+#endif
 
 /*
  * Where do we get the source from? The first 16 iterations get it from
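
A small standalone C sketch (not part of the patch) spelling out the store-forcing trick the comment above describes. The names demo_W, setW_volatile, setW_plain and setW_asm are invented for illustration, and setW_asm is one plausible reading of the 'W(t)=(val);asm("":"+m" (W(t))' one-liner the comment attributes to Artur Skawina; it assumes a GNU C compiler.

/* Sketch of the three setW() strategies discussed in the patch comment. */
#include <stdio.h>

static unsigned int array[16];

#define demo_W(x) (array[(x) & 15])

/* Forced store: every write really goes to memory (the patch's
 * SMALL_REGISTER_SET case). */
#define setW_volatile(x, val) \
	(*(volatile unsigned int *)&demo_W(x) = (val))

/* Plain store: the compiler may keep the slot in a register (the
 * patch's large-register-set case, e.g. PPC). */
#define setW_plain(x, val) (demo_W(x) = (val))

/* Skawina-style variant (hypothetical expansion of the comment's
 * one-liner): do a normal store, then tell gcc via a "+m" operand
 * that the slot may be read and written by the empty asm, so it can
 * neither drop the store nor reuse a stale register copy. */
#define setW_asm(x, val) \
	do { demo_W(x) = (val); __asm__("" : "+m" (demo_W(x))); } while (0)

int main(void)
{
	setW_volatile(0, 0x67452301u);
	setW_plain(1, 0xefcdab89u);
	setW_asm(18, 0x98badcfeu);	/* the &15 mask wraps 18 to slot 2 */
	printf("%08x %08x %08x\n", demo_W(0), demo_W(1), demo_W(2));
	return 0;
}

The volatile cast and the empty asm reach the same goal by different routes: volatile forbids caching the slot at the type level, while the "+m" constraint makes the slot's value opaque at that one point, which is why the comment notes it would also defeat gcc's "optimize away loads" behavior.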