tegrakernel/kernel/kernel-4.9/tools/arch/x86/lib/memset_64.S

/* Copyright 2002 Andi Kleen, SuSE Labs */
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
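/*
 * memset is declared weak so that another definition (for example an
 * instrumented one such as KASAN's) can override this implementation at
 * link time.
 */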
.weak memset
/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string operations to get better performance than the original function. The
 * code is simpler and shorter than the original function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
ENTRY(memset)
ENTRY(__memset)
/*
 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature; it is
 * recommended to use it when available. If ERMS is not available but fast
 * string operations are (REP_GOOD), use the fast-string code below.
 * Otherwise, fall back to the original memset function.
 */
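/*
 * The ALTERNATIVE_2 site below is patched at boot time: on CPUs with
 * X86_FEATURE_ERMS it becomes "jmp memset_erms", on CPUs with only
 * X86_FEATURE_REP_GOOD it is replaced with NOPs so execution falls through
 * to the fast-string code, and otherwise the original "jmp memset_orig"
 * is kept.
 */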
ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9
movq %rdx,%rcx
andl $7,%edx
shrq $3,%rcx
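/*
 * %rcx = count / 8 qwords for rep stosq; the count & 7 leftover bytes kept
 * in %edx are stored afterwards with rep stosb. E.g. count = 29 gives
 * 3 qwords (24 bytes) plus 5 trailing bytes.
 */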
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
imulq %rsi,%rax
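/*
 * %rax now holds the fill byte replicated into all eight byte lanes,
 * e.g. for a value of 0xab: 0xab * 0x0101010101010101 = 0xabababababababab.
 */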
rep stosq
movl %edx,%ecx
rep stosb
movq %r9,%rax
ret
ENDPROC(memset)
ENDPROC(__memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
* enhanced rep stosb to override the fast string function.
* The code is simpler and shorter than the fast string function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
ENTRY(memset_erms)
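/*
 * With ERMS, a single rep stosb is fast for any length, so no qword/byte
 * splitting or alignment handling is needed here.
 */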
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
ret
ENDPROC(memset_erms)
ENTRY(memset_orig)
movq %rdi,%r10
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
imulq %rcx,%rax
/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
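/* %r9d holds dst & 7; a non-zero value means the destination is not
   8-byte aligned and needs the fixup at .Lbad_alignment. */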
.Lafter_bad_alignment:
movq %rdx,%rcx
shrq $6,%rcx
jz .Lhandle_tail
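/*
 * Main loop: %rcx = count / 64 iterations, each storing 64 bytes with
 * eight 8-byte moves. decq comes first, and since neither movq nor leaq
 * changes the flags, the jnz at the bottom still tests the decrement.
 */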
.p2align 4
.Lloop_64:
decq %rcx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64
/* Handle the tail in loops. The loops should be faster than
   hard-to-predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %edx,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
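/* %ecx now holds the number of whole qwords in the sub-64-byte tail
   ((count & 63 & ~7) / 8); .Lloop_8 stores them one qword at a time. */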
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8
.Lhandle_7:
andl $7,%edx
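/* %edx = count & 7 final bytes; .Lloop_1 stores them one at a time. */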
jz .Lende
.p2align 4
.Lloop_1:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
.Lende:
movq %r10,%rax
ret
.Lbad_alignment:
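/*
 * Destination is not 8-byte aligned. If the whole request is at most
 * 7 bytes, the byte loop handles it. Otherwise store one unaligned qword
 * at the start and advance by 8 - (dst & 7) bytes so the main loop runs
 * aligned; e.g. a misalignment of 3 advances the pointer by 5 bytes.
 */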
cmpq $7,%rdx
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)