tegrakernel/kernel/kernel-4.9/arch/x86/lib/memmove_64.S

/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 *
 * Registers written besides rax: rcx, rdx, rsi, rdi, r8, r9, r10, r11
 * and the arithmetic flags. No stack memory is used. The direction flag
 * is set and then cleared again only inside the backward movsq path.
 *
 * Strategy:
 *  - count < 32: copy with at most four overlapping loads followed by
 *    the matching stores (label 1 and below). Every load is issued
 *    before the first store, so the direction of overlap is irrelevant.
 *  - count >= 32: choose a forward or a backward copy depending on how
 *    the buffers overlap, move the bulk either with "rep movsq" or with
 *    a 32-bytes-per-iteration register loop, then finish the remainder.
 */
/*
 * memmove is emitted as a weak symbol; __memmove below labels the same
 * entry point.
 * NOTE(review): presumably weak so that another definition can take
 * precedence at link time - confirm against the build configuration.
 */
.weak memmove

ENTRY(memmove)
ENTRY(__memmove)

	/*
	 * Set up the return value first, then send counts below 32 bytes
	 * straight to the small-size code at label 1.
	 */
	mov %rdi, %rax
	cmp $0x20, %rdx
	jb	1f

	/*
	 * Decide forward/backward copy mode:
	 *  - src >= dest                -> forward
	 *  - src <  dest < src + count  -> backward (label 2)
	 *  - src + count <= dest        -> forward (no harmful overlap)
	 * r8 is used as scratch for src + count.
	 * NOTE(review): jge/jg are signed conditions applied to addresses;
	 * confirm this is intended for the address ranges callers pass in.
	 */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

.Lmemmove_begin_forward:
	/*
	 * The default alternative is empty. For X86_FEATURE_ERMS the
	 * replacement sequence copies all count bytes with "rep movsb" and
	 * returns immediately (rax already holds dest).
	 * NOTE(review): the patching itself is done by the ALTERNATIVE macro
	 * from asm/alternative-asm.h, which is not visible here.
	 */
	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS

	/*
	 * The movsq instruction has a high startup latency, so sizes below
	 * 680 bytes are handled with general-purpose registers instead.
	 * NOTE(review): 680 is a tuning threshold; its derivation is not
	 * documented in this file.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * The movsq instruction is only good for the aligned case. Only the
	 * low bytes of dest and src are compared here: if they are equal,
	 * take the movsq path at label 4, otherwise use the register loop.
	 */

	cmpb %dil, %sil
	je 4f
3:
	/*
	 * Pre-bias the count by one chunk (32 bytes). The subtraction at the
	 * top of the loop then borrows exactly when fewer than 32 bytes will
	 * remain after the current iteration.
	 */
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 * All four quadwords are loaded before any of them is stored.
	 * The jae at the bottom tests the carry flag produced by the sub at
	 * the top; the mov and lea instructions in between leave the flags
	 * untouched.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	/*
	 * Undo the bias: rdx is now the number of bytes still to copy
	 * (0..31), and rsi/rdi point at them. Finish in the small-size code.
	 */
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 * The last 8 source bytes are loaded into r11 before the string copy
	 * and r10 remembers where they belong in the destination.
	 * "rep movsq" moves count / 8 quadwords; the final store of r11 then
	 * covers the count % 8 trailing bytes (it may rewrite bytes already
	 * copied with the same values). Loading the tail first keeps it
	 * intact even if the forward copy overwrites the end of the source.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
	/*
	 * NOTE(review): this label is not referenced anywhere in the code
	 * visible in this file.
	 */
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 * Mirror image of the forward movsq path: the first 8 source bytes
	 * are saved in r11 and the original dest in r10, rsi/rdi are moved to
	 * the last quadword of each buffer, and the copy runs downwards with
	 * the direction flag set. cld restores the direction flag before the
	 * saved head quadword is stored, which covers the count % 8 leading
	 * bytes that the quadword copy did not reach.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Start to prepare for backward copy.
	 * Same selection as in the forward case: below 680 bytes, or when
	 * the low bytes of dest and src differ, use the register loop at
	 * label 6; otherwise use the backward movsq path at label 7.
	 */
	.p2align 4
2:
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
6:
	/*
	 * Calculate copy position to tail: rsi/rdi now point one byte past
	 * the end of src/dest. The count is pre-biased by 32 exactly as in
	 * the forward loop.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 * As in the forward loop, the loads precede the stores and the jae
	 * consumes the carry flag of the subq at the top of the loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Calculate copy position to head: undo the bias so that rdx is the
	 * number of bytes left (0..31), then step rsi/rdi back by that
	 * amount so they point at the first uncopied byte, which is the
	 * start of each buffer.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
	/*
	 * Small-size code: rdx < 32 here, rsi/rdi point at the first of the
	 * rdx bytes still to copy. Each size class below loads a chunk from
	 * the start and a chunk from the end of the region (the two may
	 * overlap), and only then stores them.
	 */
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	/*
	 * Zero bytes left: nothing to do, go straight to the return.
	 */
	cmp $1, %rdx
	jb 13f
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
	/*
	 * Common exit: rax still holds the original dest set at entry.
	 */
13:
	retq
ENDPROC(__memmove)
ENDPROC(memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)
initial commit tegra kernel 32.6.1 2022-02-16 09:13:02 -06:00			`/*`
			`* Normally compiler builtins are used, but sometimes the compiler calls out`
			`* of line code. Based on asm-i386/string.h.`
			`*`
			`* This assembly file is re-written from memmove_64.c file.`
			`* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>`
			`*/`
			`#include <linux/linkage.h>`
			`#include <asm/cpufeatures.h>`
			`#include <asm/alternative-asm.h>`
			`#include <asm/export.h>`

			`#undef memmove`

			`/*`
			`* Implement memmove(). This can handle overlap between src and dst.`
			`*`
			`* Input:`
			`* rdi: dest`
			`* rsi: src`
			`* rdx: count`
			`*`
			`* Output:`
			`* rax: dest`
			`*/`
			`.weak memmove`

			`ENTRY(memmove)`
			`ENTRY(__memmove)`

			`/* Handle more 32 bytes in loop */`
			`mov %rdi, %rax`
			`cmp $0x20, %rdx`
			`jb 1f`

			`/* Decide forward/backward copy mode */`
			`cmp %rdi, %rsi`
			`jge .Lmemmove_begin_forward`
			`mov %rsi, %r8`
			`add %rdx, %r8`
			`cmp %rdi, %r8`
			`jg 2f`

			`.Lmemmove_begin_forward:`
			`ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS`

			`/*`
			`* movsq instruction have many startup latency`
			`* so we handle small size by general register.`
			`*/`
			`cmp $680, %rdx`
			`jb 3f`
			`/*`
			`* movsq instruction is only good for aligned case.`
			`*/`

			`cmpb %dil, %sil`
			`je 4f`
			`3:`
			`sub $0x20, %rdx`
			`/*`
			`* We gobble 32 bytes forward in each loop.`
			`*/`
			`5:`
			`sub $0x20, %rdx`
			`movq 0*8(%rsi), %r11`
			`movq 1*8(%rsi), %r10`
			`movq 2*8(%rsi), %r9`
			`movq 3*8(%rsi), %r8`
			`leaq 4*8(%rsi), %rsi`

			`movq %r11, 0*8(%rdi)`
			`movq %r10, 1*8(%rdi)`
			`movq %r9, 2*8(%rdi)`
			`movq %r8, 3*8(%rdi)`
			`leaq 4*8(%rdi), %rdi`
			`jae 5b`
			`addq $0x20, %rdx`
			`jmp 1f`
			`/*`
			`* Handle data forward by movsq.`
			`*/`
			`.p2align 4`
			`4:`
			`movq %rdx, %rcx`
			`movq -8(%rsi, %rdx), %r11`
			`lea -8(%rdi, %rdx), %r10`
			`shrq $3, %rcx`
			`rep movsq`
			`movq %r11, (%r10)`
			`jmp 13f`
			`.Lmemmove_end_forward:`

			`/*`
			`* Handle data backward by movsq.`
			`*/`
			`.p2align 4`
			`7:`
			`movq %rdx, %rcx`
			`movq (%rsi), %r11`
			`movq %rdi, %r10`
			`leaq -8(%rsi, %rdx), %rsi`
			`leaq -8(%rdi, %rdx), %rdi`
			`shrq $3, %rcx`
			`std`
			`rep movsq`
			`cld`
			`movq %r11, (%r10)`
			`jmp 13f`

			`/*`
			`* Start to prepare for backward copy.`
			`*/`
			`.p2align 4`
			`2:`
			`cmp $680, %rdx`
			`jb 6f`
			`cmp %dil, %sil`
			`je 7b`
			`6:`
			`/*`
			`* Calculate copy position to tail.`
			`*/`
			`addq %rdx, %rsi`
			`addq %rdx, %rdi`
			`subq $0x20, %rdx`
			`/*`
			`* We gobble 32 bytes backward in each loop.`
			`*/`
			`8:`
			`subq $0x20, %rdx`
			`movq -1*8(%rsi), %r11`
			`movq -2*8(%rsi), %r10`
			`movq -3*8(%rsi), %r9`
			`movq -4*8(%rsi), %r8`
			`leaq -4*8(%rsi), %rsi`

			`movq %r11, -1*8(%rdi)`
			`movq %r10, -2*8(%rdi)`
			`movq %r9, -3*8(%rdi)`
			`movq %r8, -4*8(%rdi)`
			`leaq -4*8(%rdi), %rdi`
			`jae 8b`
			`/*`
			`* Calculate copy position to head.`
			`*/`
			`addq $0x20, %rdx`
			`subq %rdx, %rsi`
			`subq %rdx, %rdi`
			`1:`
			`cmpq $16, %rdx`
			`jb 9f`
			`/*`
			`* Move data from 16 bytes to 31 bytes.`
			`*/`
			`movq 0*8(%rsi), %r11`
			`movq 1*8(%rsi), %r10`
			`movq -2*8(%rsi, %rdx), %r9`
			`movq -1*8(%rsi, %rdx), %r8`
			`movq %r11, 0*8(%rdi)`
			`movq %r10, 1*8(%rdi)`
			`movq %r9, -2*8(%rdi, %rdx)`
			`movq %r8, -1*8(%rdi, %rdx)`
			`jmp 13f`
			`.p2align 4`
			`9:`
			`cmpq $8, %rdx`
			`jb 10f`
			`/*`
			`* Move data from 8 bytes to 15 bytes.`
			`*/`
			`movq 0*8(%rsi), %r11`
			`movq -1*8(%rsi, %rdx), %r10`
			`movq %r11, 0*8(%rdi)`
			`movq %r10, -1*8(%rdi, %rdx)`
			`jmp 13f`
			`10:`
			`cmpq $4, %rdx`
			`jb 11f`
			`/*`
			`* Move data from 4 bytes to 7 bytes.`
			`*/`
			`movl (%rsi), %r11d`
			`movl -4(%rsi, %rdx), %r10d`
			`movl %r11d, (%rdi)`
			`movl %r10d, -4(%rdi, %rdx)`
			`jmp 13f`
			`11:`
			`cmp $2, %rdx`
			`jb 12f`
			`/*`
			`* Move data from 2 bytes to 3 bytes.`
			`*/`
			`movw (%rsi), %r11w`
			`movw -2(%rsi, %rdx), %r10w`
			`movw %r11w, (%rdi)`
			`movw %r10w, -2(%rdi, %rdx)`
			`jmp 13f`
			`12:`
			`cmp $1, %rdx`
			`jb 13f`
			`/*`
			`* Move data for 1 byte.`
			`*/`
			`movb (%rsi), %r11b`
			`movb %r11b, (%rdi)`
			`13:`
			`retq`
			`ENDPROC(__memmove)`
			`ENDPROC(memmove)`
			`EXPORT_SYMBOL(__memmove)`
			`EXPORT_SYMBOL(memmove)`