Use __builtin_bswap64 if it is available

This produces slightly better performance than the inline assembly,
and has the added benefit that it should be portable to other systems
that use gcc, not just x86-64.

Here are the results on my "AMD Athlon(tm) 7450 Dual-Core Processor"
with "gcc (Ubuntu 4.3.3-5ubuntu4) 4.3.3":

with portable 64H macros:

camellia            : Schedule at   1659
camellia            [ 23]: Encrypt at   431, Decrypt at   434
whirlpool           : Process at    55

with inline assembly (with "memory clobber" for correctness):

camellia            : Schedule at   1380
camellia            [ 23]: Encrypt at   406, Decrypt at   403
whirlpool           : Process at    50

with __builtin_bswap64:

camellia            : Schedule at   1352
camellia            [ 23]: Encrypt at   396, Decrypt at   391
whirlpool           : Process at    46
This commit is contained in:
Patrick Pelletier 2011-09-25 20:18:26 -07:00 committed by Steffen Jaeckel
parent cefff85550
commit ad566e1b00

View File

@ -96,9 +96,20 @@ asm __volatile__ ( \
#endif
/* gcc 4.3 and up has a bswap builtin */
#if !defined(LTC_NO_BSWAP) && \
(defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403))
/* STORE64H(x, y): byte-swap the 64-bit value x with __builtin_bswap64 and
 * copy the resulting 8 bytes to the location y points at, via XMEMCPY.
 * NOTE(review): this yields big-endian output only on a little-endian host;
 * presumably this branch sits inside a little-endian-only section -- confirm
 * against the enclosing #if.
 * x and y are each evaluated exactly once.
 * The body is wrapped in do { } while (0) so that `STORE64H(a, b);` forms a
 * single statement and can be used in an unbraced if/else; the caller
 * supplies the trailing semicolon. The temporary avoids a `__`-prefixed
 * name, which is reserved for the implementation. */
#define STORE64H(x, y) \
do { ulong64 ltc_bswap_tmp64 = __builtin_bswap64 ((x)); \
XMEMCPY ((y), &ltc_bswap_tmp64, 8); } while (0)
/* LOAD64H(x, y): copy 8 bytes from the location y points at into x via
 * XMEMCPY, then byte-swap x in place with __builtin_bswap64.
 * NOTE(review): this decodes big-endian input only on a little-endian host;
 * presumably this branch sits inside a little-endian-only section -- confirm
 * against the enclosing #if.
 * x must be a modifiable lvalue at least 8 bytes wide (its address is taken)
 * and is expanded three times, so it must not have side effects; y is
 * evaluated once.
 * The body is wrapped in do { } while (0) so that `LOAD64H(a, b);` forms a
 * single statement and can be used in an unbraced if/else; the caller
 * supplies the trailing semicolon. */
#define LOAD64H(x, y) \
do { XMEMCPY (&(x), (y), 8); \
(x) = __builtin_bswap64 ((x)); } while (0)
/* x86_64 processor */
#if !defined(LTC_NO_BSWAP) && (defined(__GNUC__) && defined(__x86_64__))
#elif !defined(LTC_NO_BSWAP) && (defined(__GNUC__) && defined(__x86_64__))
#define STORE64H(x, y) \
asm __volatile__ ( \