7bcd3f34e2
They cause quite bad performance regressions on Netburst. This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. The only change is that I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD, fixed the check for the flag, and fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
102 lines
1.9 KiB
ArmAsm
102 lines
1.9 KiB
ArmAsm
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
|
|
|
|
/* Don't use streaming stores, because it's better when the target
   ends up in cache. */
|
|
|
|
/* Could vary the prefetch distance based on SMP/UP */
|
|
|
|
/*
 * copy_page - copy one 4096-byte page with unrolled 64-bit moves.
 *
 * Syntax: GAS, AT&T operand order, x86-64.
 * In:     %rdi = destination, %rsi = source
 * Out:    nothing; %rdi and %rsi are both advanced by 4096
 * Clobb:  %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags
 * Stack:  2*8 bytes, holding the caller's %rbx and %r12
 *
 * NOTE(review): the register roles above are read off the loads and stores
 * below; the C prototype is not visible in this file - confirm against the
 * header that declares copy_page.
 */
	.globl	copy_page
	.p2align 4
copy_page:
	/*
	 * %rbx and %r12 serve as copy scratch registers below, so their
	 * incoming values are parked on the stack.  No other register that
	 * needs preserving is touched.
	 */
	subq	$2*8,%rsp
	movq	%rbx,(%rsp)
	movq	%r12,1*8(%rsp)

	/*
	 * Main loop: all 64-byte lines of the page except the last 5,
	 * prefetching the source 5 lines ahead of the line being copied.
	 */
	movl	$(4096/64)-5,%ecx
	.p2align 4
.Loop64:
	decl	%ecx			/* sets ZF for the jnz at the bottom */

	/* Load a whole 64-byte line before storing any of it. */
	movq	(%rsi),%rax
	movq	8(%rsi),%rbx
	movq	16(%rsi),%rdx
	movq	24(%rsi),%r8
	movq	32(%rsi),%r9
	movq	40(%rsi),%r10
	movq	48(%rsi),%r11
	movq	56(%rsi),%r12

	prefetcht0 5*64(%rsi)

	movq	%rax,(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)
	movq	%r10,40(%rdi)
	movq	%r11,48(%rdi)
	movq	%r12,56(%rdi)

	/* lea advances the pointers without disturbing ZF from the decl. */
	leaq	64(%rsi),%rsi
	leaq	64(%rdi),%rdi

	jnz	.Loop64

	/*
	 * Tail loop: the last 5 lines.  Same copy, but without the prefetch,
	 * which would otherwise reach past the end of the source page.
	 */
	movl	$5,%ecx
	.p2align 4
.Loop2:
	decl	%ecx			/* sets ZF for the jnz at the bottom */

	movq	(%rsi),%rax
	movq	8(%rsi),%rbx
	movq	16(%rsi),%rdx
	movq	24(%rsi),%r8
	movq	32(%rsi),%r9
	movq	40(%rsi),%r10
	movq	48(%rsi),%r11
	movq	56(%rsi),%r12

	movq	%rax,(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)
	movq	%r10,40(%rdi)
	movq	%r11,48(%rdi)
	movq	%r12,56(%rdi)

	leaq	64(%rdi),%rdi
	leaq	64(%rsi),%rsi

	jnz	.Loop2

	/* Give the caller back its %rbx and %r12 and release the frame. */
	movq	(%rsp),%rbx
	movq	1*8(%rsp),%r12
	addq	$2*8,%rsp
	ret
|
|
|
|
/* Some CPUs run faster using the string copy instructions.
   It is also a lot simpler. Use this when possible. */
|
|
|
|
#include <asm/cpufeature.h>
|
|
|
|
/*
 * Record emitted into .altinstructions for copy_page.  Layout, in order:
 *   - 64-bit address of copy_page
 *   - 64-bit address of copy_page_c
 *   - one byte holding X86_FEATURE_REP_GOOD
 *   - two bytes, each holding the size of copy_page_c in bytes
 * NOTE(review): presumably this is read by the kernel's alternatives
 * patching code to substitute copy_page_c for the start of copy_page when
 * the feature is present - confirm against the structure that consumes
 * this section.
 */
	.pushsection .altinstructions,"a"
	.balign	8
	.quad	copy_page
	.quad	copy_page_c
	.byte	X86_FEATURE_REP_GOOD
	.byte	copy_page_c_end - copy_page_c
	.byte	copy_page_c_end - copy_page_c
	.popsection
|
|
|
|
/*
 * copy_page_c - string-instruction variant of the page copy.
 *
 * In:    %rdi = destination, %rsi = source
 * Clobb: %rcx, %rsi, %rdi
 *
 * Moves 4096/8 quadwords with a single "rep movsq".  copy_page_c_end marks
 * the first byte after the routine so that its size can be computed as
 * copy_page_c_end - copy_page_c.
 * NOTE(review): assumes the direction flag is clear on entry, so the copy
 * runs towards higher addresses - confirm against the calling convention
 * in use.
 */
	.pushsection .altinstr_replacement,"ax"
copy_page_c:
	movl	$4096/8,%ecx		/* quadword count for the rep prefix */
	rep movsq
	ret
copy_page_c_end:
	.popsection
|