Use __builtin_bswap64 if it is available
This produces slightly better performance than the inline assembly, and has the added benefit that it should be portable to other systems that use gcc, not just x86-64.

Here are the results on my "AMD Athlon(tm) 7450 Dual-Core Processor" with "gcc (Ubuntu 4.3.3-5ubuntu4) 4.3.3":

with portable 64H macros:
  camellia       : Schedule at 1659
  camellia [ 23] : Encrypt at 431, Decrypt at 434
  whirlpool      : Process at 55

with inline assembly (with "memory clobber" for correctness):
  camellia       : Schedule at 1380
  camellia [ 23] : Encrypt at 406, Decrypt at 403
  whirlpool      : Process at 50

with __builtin_bswap64:
  camellia       : Schedule at 1352
  camellia [ 23] : Encrypt at 396, Decrypt at 391
  whirlpool      : Process at 46
This commit is contained in:
parent
cefff85550
commit
ad566e1b00
@ -96,9 +96,20 @@ asm __volatile__ ( \
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* gcc 4.3 and up has a bswap builtin */
|
||||||
|
#if !defined(LTC_NO_BSWAP) && \
|
||||||
|
(defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403))
|
||||||
|
|
||||||
|
/*
 * STORE64H(x, y): byte-reverse the 64-bit value x with __builtin_bswap64
 * and copy the 8 resulting bytes to the buffer y.
 *
 * x is evaluated exactly once. y may be unaligned because the bytes are
 * moved with XMEMCPY rather than through a casted pointer.
 *
 * NOTE(review): on a little-endian host this writes x most-significant
 * byte first; presumably this branch is only reached on such hosts --
 * confirm against the enclosing conditionals.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in an unbraced if/else), and the temporary avoids the
 * reserved "__" identifier prefix.
 */
#define STORE64H(x, y) \
   do { ulong64 ltc_bswap_tmp_ = __builtin_bswap64 ((x)); \
        XMEMCPY ((y), &ltc_bswap_tmp_, 8); } while (0)
|
||||||
|
|
||||||
|
/*
 * LOAD64H(x, y): copy 8 bytes from the buffer y, byte-reverse them with
 * __builtin_bswap64, and assign the result to the lvalue x.
 *
 * The bytes are first copied into a local 64-bit temporary, so exactly
 * 8 bytes are written to an object that is known to be 8 bytes wide, and
 * x is evaluated only once. y may be unaligned because the bytes are moved
 * with XMEMCPY rather than through a casted pointer.
 *
 * NOTE(review): on a little-endian host this reads y as a
 * most-significant-byte-first value; presumably this branch is only
 * reached on such hosts -- confirm against the enclosing conditionals.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in an unbraced if/else).
 */
#define LOAD64H(x, y) \
   do { ulong64 ltc_bswap_tmp_; \
        XMEMCPY (&ltc_bswap_tmp_, (y), 8); \
        (x) = __builtin_bswap64 (ltc_bswap_tmp_); } while (0)
|
||||||
|
|
||||||
/* x86_64 processor */
|
/* x86_64 processor */
|
||||||
#if !defined(LTC_NO_BSWAP) && (defined(__GNUC__) && defined(__x86_64__))
|
#elif !defined(LTC_NO_BSWAP) && (defined(__GNUC__) && defined(__x86_64__))
|
||||||
|
|
||||||
#define STORE64H(x, y) \
|
#define STORE64H(x, y) \
|
||||||
asm __volatile__ ( \
|
asm __volatile__ ( \
|
||||||
|
Loading…
Reference in New Issue
Block a user