From ad566e1b00fe6a83b4ef024e9945d24177f9310e Mon Sep 17 00:00:00 2001
From: Patrick Pelletier
Date: Sun, 25 Sep 2011 20:18:26 -0700
Subject: [PATCH] Use __builtin_bswap64 if it is available

This produces slightly better performance than the inline assembly,
and has the added benefit that it should be portable to other systems
that use gcc, not just x86-64.

Here are the results on my "AMD Athlon(tm) 7450 Dual-Core Processor"
with "gcc (Ubuntu 4.3.3-5ubuntu4) 4.3.3":

with portable 64H macros:
camellia : Schedule at 1659
camellia [ 23]: Encrypt at 431, Decrypt at 434
whirlpool : Process at 55

with inline assembly (with "memory clobber" for correctness):
camellia : Schedule at 1380
camellia [ 23]: Encrypt at 406, Decrypt at 403
whirlpool : Process at 50

with __builtin_bswap64:
camellia : Schedule at 1352
camellia [ 23]: Encrypt at 396, Decrypt at 391
whirlpool : Process at 46
---
 src/headers/tomcrypt_macros.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/headers/tomcrypt_macros.h b/src/headers/tomcrypt_macros.h
index 52a1719..732ec3c 100644
--- a/src/headers/tomcrypt_macros.h
+++ b/src/headers/tomcrypt_macros.h
@@ -96,9 +96,20 @@ asm __volatile__ ( \
 
 #endif
 
+/* gcc 4.3 and up has a bswap builtin: swap the 64-bit value, move 8 bytes via XMEMCPY */
+#if !defined(LTC_NO_BSWAP) && \
+    (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403))
+
+#define STORE64H(x, y) \
+do { ulong64 __t = __builtin_bswap64 ((x)); \
+     XMEMCPY ((y), &__t, 8); } while (0)
+
+#define LOAD64H(x, y) \
+do { XMEMCPY (&(x), (y), 8); \
+     (x) = __builtin_bswap64 ((x)); } while (0)
 
 /* x86_64 processor */
-#if !defined(LTC_NO_BSWAP) && (defined(__GNUC__) && defined(__x86_64__))
+#elif !defined(LTC_NO_BSWAP) && (defined(__GNUC__) && defined(__x86_64__))
 
 #define STORE64H(x, y) \
 asm __volatile__ ( \