author    | Edward Gillett <Edward.Gillett@Sun.COM> | 2009-09-18 14:25:49 -0700
committer | Edward Gillett <Edward.Gillett@Sun.COM> | 2009-09-18 14:25:49 -0700
commit    | 533d3a4910febc9985154b885dbe971e3c21ca04 (patch)
tree      | c0161751a7f2919384220e54bcb6b7c5f7a8ade4 /usr/src/lib/libc
parent    | a28e62acca32a4c261beff5ecfd9a094a053e145 (diff)
download  | illumos-joyent-533d3a4910febc9985154b885dbe971e3c21ca04.tar.gz
6869408 64-bit libc string functions could be improved with SSE
Contributed by Ling Ma <ling.ma@intel.com>, Xinping Huang <xinping.huang@intel.com> and Robert Kasten <robert.a.kasten@intel.com>
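At its core, the new SSE path (visible throughout the strcmp.s hunks below) examines 16 bytes of both strings per step: one pcmpeqb against a zeroed register flags NUL bytes, a second pcmpeqb flags equal bytes, psubb folds the two masks, and pmovmskb yields a 16-bit mask that equals 0xffff only when all 16 byte pairs match and none is a terminator. A minimal C sketch of that check using SSE2 intrinsics — illustrative only, not the committed assembly; the function name and the aligned-pointer assumption are mine:

#include <emmintrin.h>	/* SSE2 intrinsics */

/*
 * Illustrative sketch (not the committed code): returns 0xffff iff the
 * 16 bytes at a and b are pairwise equal and contain no NUL terminator;
 * any other value means "stop and locate the differing/terminating
 * byte".  Both pointers are assumed 16-byte aligned here; the real
 * assembly handles every misalignment case via the ashr_* paths.
 */
static unsigned
cmp16_mask(const char *a, const char *b)
{
	__m128i va   = _mm_load_si128((const __m128i *)a);
	__m128i vb   = _mm_load_si128((const __m128i *)b);
	__m128i zero = _mm_setzero_si128();

	/* 0xff in each lane where va holds a NUL byte (pcmpeqb vs. zero) */
	__m128i has_nul = _mm_cmpeq_epi8(va, zero);
	/* 0xff in each lane where va and vb agree (second pcmpeqb) */
	__m128i eq      = _mm_cmpeq_epi8(va, vb);
	/* equal-and-not-NUL lanes stay 0xff; this is the psubb step */
	__m128i ok      = _mm_sub_epi8(eq, has_nul);

	/* pmovmskb: one bit per lane; 0xffff means keep going */
	return ((unsigned)_mm_movemask_epi8(ok));
}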
Diffstat (limited to 'usr/src/lib/libc')
-rw-r--r-- | usr/src/lib/libc/amd64/gen/proc64_id.c      |    3
-rw-r--r-- | usr/src/lib/libc/amd64/gen/proc64_id.h      |    5
-rw-r--r-- | usr/src/lib/libc/amd64/gen/proc64_support.s |    4
-rw-r--r-- | usr/src/lib/libc/amd64/gen/strcmp.s         | 2223
-rw-r--r-- | usr/src/lib/libc/amd64/gen/strcpy.s         | 2834
-rw-r--r-- | usr/src/lib/libc/amd64/gen/strlen.s         |  603
6 files changed, 4335 insertions, 1337 deletions
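The proc64_id changes at the top of the diff add a USE_BSF capability bit that __proc64id() ORs into the method word on Intel CPUs; strcmp's exit tail then tests that bit in .memops_method to choose between a bsf-based tail and the byte-by-byte AMD_exit tail. A rough C rendering of that dispatch — a sketch under my own naming (exit_tail, with ffs() standing in for the bsf instruction), not the committed code:

#include <strings.h>	/* ffs() */

#define	USE_BSF	0x20	/* mirrors the new bit in proc64_id.h */

extern int memops_method;	/* set once at startup by __proc64id() */

/*
 * Sketch of the exit-tail dispatch: 'mask' is the pmovmskb result with
 * one bit set per mismatching/NUL byte position (mask != 0 here).  On
 * CPUs where bsf is fast (USE_BSF set), jump straight to the first set
 * bit; otherwise test the mask bit by bit, as the AMD_exit path below
 * does with its test/jnz ladder.
 */
static int
exit_tail(unsigned mask, const unsigned char *s1, const unsigned char *s2)
{
	int i;

	if (memops_method & USE_BSF) {
		i = ffs((int)mask) - 1;	/* index of lowest set bit (bsf) */
	} else {
		for (i = 0; !(mask & 1); mask >>= 1)
			i++;
	}
	return (s1[i] - s2[i]);	/* same sign convention as strcmp */
}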
diff --git a/usr/src/lib/libc/amd64/gen/proc64_id.c b/usr/src/lib/libc/amd64/gen/proc64_id.c index eac045037d..656244b4ad 100644 --- a/usr/src/lib/libc/amd64/gen/proc64_id.c +++ b/usr/src/lib/libc/amd64/gen/proc64_id.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2008, Intel Corporation. + * Copyright (c) 2009, Intel Corporation. * All rights reserved. */ @@ -226,6 +226,7 @@ __proc64id(void) if (cpuid_info.edx & CPUID_INTC_EDX_SSE2) { use_sse |= USE_SSE2; } + use_sse |= USE_BSF; __intel_set_memops_method(use_sse); } else { __set_cache_sizes(INTEL_DFLT_L1_CACHE_SIZE, diff --git a/usr/src/lib/libc/amd64/gen/proc64_id.h b/usr/src/lib/libc/amd64/gen/proc64_id.h index 8722e7ff5a..98a00bfa85 100644 --- a/usr/src/lib/libc/amd64/gen/proc64_id.h +++ b/usr/src/lib/libc/amd64/gen/proc64_id.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2008, Intel Corporation + * Copyright (c) 2009, Intel Corporation * All rights reserved. */ @@ -38,7 +38,7 @@ extern "C" { #endif /* - * Defines to determine what SSE instructions can be used for memops or strops. + * Defines to determine what SSE instructions can be used for memops or strops */ #define NO_SSE 0x00 /* Default -- Don't use SSE instructions */ #define USE_SSE2 0x01 /* SSE2 */ @@ -46,6 +46,7 @@ extern "C" { #define USE_SSSE3 0x04 /* Supplemental SSE3 */ #define USE_SSE4_1 0x08 /* SSE 4.1 */ #define USE_SSE4_2 0x10 /* SSE 4.2 */ +#define USE_BSF 0x20 /* USE BSF class of instructions */ /* * Cache size defaults for Core 2 Duo diff --git a/usr/src/lib/libc/amd64/gen/proc64_support.s b/usr/src/lib/libc/amd64/gen/proc64_support.s index 8f499acc38..37a48ee029 100644 --- a/usr/src/lib/libc/amd64/gen/proc64_support.s +++ b/usr/src/lib/libc/amd64/gen/proc64_support.s @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2008, Intel Corporation + * Copyright (c) 2009, Intel Corporation * All rights reserved. */ @@ -38,8 +38,6 @@ * cache size information. Cache information used by memset, strcpy, etc.. */ - .file "proc64_support.s" - #include <sys/asm_linkage.h> #include "proc64_id.h" diff --git a/usr/src/lib/libc/amd64/gen/strcmp.s b/usr/src/lib/libc/amd64/gen/strcmp.s index 13532e2b47..8d04a52534 100644 --- a/usr/src/lib/libc/amd64/gen/strcmp.s +++ b/usr/src/lib/libc/amd64/gen/strcmp.s @@ -1,539 +1,2048 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END */ /* - * Copyright (c) 2002 Advanced Micro Devices, Inc. - * + * Copyright (c) 2009, Intel Corporation * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the - * following conditions are met: - * - * + Redistributions of source code must retain the above - * copyright notice, this list of conditions and the - * following disclaimer. - * - * + Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the - * following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - * + Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or - * promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND - * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES, - * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * It is licensee's responsibility to comply with any export - * regulations applicable in licensee's jurisdiction. */ - .file "strcmp.s" +/* + * str[n]cmp - compare chars between two string + */ #include "SYS.h" -#include "cache.h" +#include "proc64_id.h" #define LABEL(s) .strcmp/**/s #ifdef USE_AS_STRNCMP + /* + * Since the counter, %r11, is unsigned, we branch to strcmp_exitz + * if the new counter > the old one or is 0. + */ +#define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 +#else +#define UPDATE_STRNCMP_COUNTER +#endif + + /* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCMP ENTRY(strncmp) + test %rdx, %rdx + je LABEL(strcmp_exitz) + mov %rdx, %r11 #else ENTRY(strcmp) /* (const char *, const char *) */ #endif - xor %ecx, %ecx + mov %esi, %ecx + mov %edi, %eax + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, found mismatch or null char */ +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparision */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine rdi and rsi string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi to be 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi to be 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* both strings have the same alignment */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + mov %rcx, %r9 + sub %rax, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9, 4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ +/* + * ashr_0 handles the following cases: + * str1 offset = str2 offset + */ + .p2align 4 +LABEL(ashr_0): + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - test %rdx, %rdx /* (const char *, const char *, size_t) */ - mov %r14, -8 (%rsp) - mov %rdx, %r14 - mov %edx, %eax - jz LABEL(exitz) /* early exit */ + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif + add $16, %rcx + jmp LABEL(loop_ashr_0) -LABEL(aligntry): - mov %rsi, %r8 /* align by "source" */ - and $8 - 1, %r8 /* between 0 and 8 characters compared */ - jz LABEL(alignafter) +/* + * ashr_1 handles the following cases: + * abs(str1 offset - str2 offset) = 15 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pslldq $15, %xmm2 /* shift first string to align with second */ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $1, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) -LABEL(align): - sub $8, %r8 +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) - .p2align 4 + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + psrldq $1, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x7fff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $15, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) -LABEL(alignloop): - mov (%rsi, %rcx), %al - mov (%rdi, %rcx), %dl +/* + * ashr_2 handles the following cases: + * abs(str1 offset - str2 offset) = 14 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exitafter) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - cmp %dl, %al /* check if same character */ - jne LABEL(exitafter) - test %al, %al /* check if character a NUL */ - jz LABEL(exitafter) + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ - inc %ecx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - inc %r8 - jnz LABEL(alignloop) + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + psrldq $2, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x3fff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - test %r14, %r14 - jz LABEL(exitafter) + cmp $14, %r11 + jbe LABEL(strcmp_exitz) #endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_2) - .p2align 4 +/* + * ashr_3 handles the following cases: + * abs(str1 offset - str2 offset) = 13 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_3): + add $16, %r10 + jg LABEL(nibble_ashr_3) + +LABEL(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) -LABEL(alignafter): +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif - mov %r15, -32 (%rsp) - mov %rbp, -24 (%rsp) - mov %rbx, -16 (%rsp) + add $16, %rcx + movdqa %xmm4, %xmm3 -LABEL(pagealigntry): /* page align by "destination" */ - lea (%rdi, %rcx), %ebp - mov $AMD64PAGESIZE, %r15d - and $AMD64PAGEMASK, %ebp - sub %r15d, %ebp + add $16, %r10 + jg LABEL(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_3) + + .p2align 4 +LABEL(nibble_ashr_3): + psrldq $3, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x1fff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $13, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_3) + +/* + * ashr_4 handles the following cases: + * abs(str1 offset - str2 offset) = 12 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* rdi bytes already examined. Used in exit code */ /* - * When we go to 64gobble, %ebp was adjusted at the top of 64loop. - * When we go to 64nibble(crossing page boundary), we'll compare - * 128 byte since we'll fall through to 64gobble. Therefore, %ebp - * needs to be re-adjusted(add 64) when we fall into 64nibble. - * It can be done by adjusting %r15 since %r15 is only used to - * rewind %ebp when crossing page boundary. + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
*/ - sub $64, %r15d + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_4): + add $16, %r10 + jg LABEL(nibble_ashr_4) + +LABEL(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 -LABEL(64): /* 64-byte */ - mov $0xfefefefefefefeff, %rbx /* magic number */ + add $16, %r10 + jg LABEL(nibble_ashr_4) /* cross page boundary */ - .p2align 4 + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 -LABEL(64loop): - add $64, %ebp /* check if "destination" crosses a page unevenly */ - jle LABEL(64gobble) + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 - sub %r15d, %ebp - lea 64 (%rcx), %r8 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - .p2align 4 +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_4) + + .p2align 4 +LABEL(nibble_ashr_4): + psrldq $4, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x0fff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $12, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_4) -LABEL(64nibble): - mov (%rsi, %rcx), %al - mov (%rdi, %rcx), %dl +/* + * ashr_5 handles the following cases: + * abs(str1 offset - str2 offset) = 11 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_5): + add $16, %r10 + jg LABEL(nibble_ashr_5) + +LABEL(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - dec %r14 - jle LABEL(exit) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - cmp %dl, %al /* check if same character */ - jne LABEL(exit) - test %al, %al /* check if character a NUL */ - jz LABEL(exit) + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 - inc %ecx + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - cmp %ecx, %r8d - ja LABEL(64nibble) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif - .p2align 4 + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_5) + + .p2align 4 +LABEL(nibble_ashr_5): + psrldq $5, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x07ff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $11, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_5) -LABEL(64gobble): - mov (%rsi, %rcx), %rax - mov (%rdi, %rcx), %rdx +/* + * ashr_6 handles the following cases: + * abs(str1 offset - str2 offset) = 10 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_6): + add $16, %r10 + jg LABEL(nibble_ashr_6) + +LABEL(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_6) /* cross page boundary */ - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - cmp %rdx, %rax - jne LABEL(tail) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_6) + + .p2align 4 +LABEL(nibble_ashr_6): + psrldq $6, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x03ff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $10, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_6) - mov 8 (%rsi, %rcx), %rax - mov 8 (%rdi, %rcx), %rdx - add $8, %ecx +/* + * ashr_7 handles the following cases: + * abs(str1 offset - str2 offset) = 9 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_7): + add $16, %r10 + jg LABEL(nibble_ashr_7) + +LABEL(gobble_ashr_7): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 + add $16, %rcx + movdqa %xmm4, %xmm3 - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 + add $16, %r10 + jg LABEL(nibble_ashr_7) /* cross page boundary */ - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 - cmp %rdx, %rax - jne LABEL(tail) + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - mov 8 (%rsi, %rcx), %rax - mov 8 (%rdi, %rcx), %rdx - add $8, %ecx +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_7) + + .p2align 4 +LABEL(nibble_ashr_7): + psrldq $7, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x01ff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + cmp $9, %r11 + jbe LABEL(strcmp_exitz) #endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_7) + +/* + * ashr_8 handles the following cases: + * abs(str1 offset - str2 offset) = 8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_8): + add $16, %r10 + jg LABEL(nibble_ashr_8) + +LABEL(gobble_ashr_8): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 + add $16, %rcx + movdqa %xmm4, %xmm3 - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + add $16, %r10 + jg LABEL(nibble_ashr_8) /* cross page boundary */ - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - cmp %rdx, %rax - jne LABEL(tail) + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 - mov 8 (%rsi, %rcx), %rax - mov 8 (%rdi, %rcx), %rdx - add $8, %ecx + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_8) + + .p2align 4 +LABEL(nibble_ashr_8): + psrldq $8, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x00ff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $8, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_8) + +/* + * ashr_9 handles the following cases: + * abs(str1 offset - str2 offset) = 7 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_9): + add $16, %r10 + jg LABEL(nibble_ashr_9) + +LABEL(gobble_ashr_9): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_9) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 /* store for next cycle */ + jmp LABEL(loop_ashr_9) + + .p2align 4 +LABEL(nibble_ashr_9): + psrldq $9, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x007f, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $7, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_9) + +/* + * ashr_10 handles the following cases: + * abs(str1 offset - str2 offset) = 6 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_10): + add $16, %r10 + jg LABEL(nibble_ashr_10) + +LABEL(gobble_ashr_10): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 + add $16, %rcx + movdqa %xmm4, %xmm3 - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + add $16, %r10 + jg LABEL(nibble_ashr_10) /* cross page boundary */ - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - cmp %rdx, %rax - jne LABEL(tail) + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 - mov 8 (%rsi, %rcx), %rax - mov 8 (%rdi, %rcx), %rdx - add $8, %ecx + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_10) + + .p2align 4 +LABEL(nibble_ashr_10): + psrldq $10, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x003f, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $6, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_10) + +/* + * ashr_11 handles the following cases: + * abs(str1 offset - str2 offset) = 5 + */ + .p2align 4 +LABEL(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_11): + add $16, %r10 + jg LABEL(nibble_ashr_11) + +LABEL(gobble_ashr_11): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_11) /* cross page boundary */ - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif - cmp %rdx, %rax - jne LABEL(tail) + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_11) + + .p2align 4 +LABEL(nibble_ashr_11): + psrldq $11, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x001f, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $5, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_11) - mov 8 (%rsi, %rcx), %rax - mov 8 (%rdi, %rcx), %rdx - add $8, %ecx +/* + * ashr_12 handles the following cases: + * abs(str1 offset - str2 offset) = 4 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_12): + add $16, %r10 + jg LABEL(nibble_ashr_12) + +LABEL(gobble_ashr_12): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 + add $16, %rcx + movdqa %xmm4, %xmm3 - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 + add $16, %r10 + jg LABEL(nibble_ashr_12) /* cross page boundary */ - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 - cmp %rdx, %rax - jne LABEL(tail) + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - mov 8 (%rsi, %rcx), %rax - mov 8 (%rdi, %rcx), %rdx - add $8, %ecx +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_12) + + .p2align 4 +LABEL(nibble_ashr_12): + psrldq $12, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x000f, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $4, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_12) + +/* + * ashr_13 handles the following cases: + * abs(str1 offset - str2 offset) = 3 + */ + .p2align 4 +LABEL(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_13): + add $16, %r10 + jg LABEL(nibble_ashr_13) + +LABEL(gobble_ashr_13): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 + add $16, %rcx + movdqa %xmm4, %xmm3 - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 + add $16, %r10 + jg LABEL(nibble_ashr_13) /* cross page boundary */ - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 - cmp %rdx, %rax - jne LABEL(tail) + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - mov 8 (%rsi, %rcx), %rax - mov 8 (%rdi, %rcx), %rdx - add $8, %ecx +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_13) + + .p2align 4 +LABEL(nibble_ashr_13): + psrldq $13, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x0007, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - sub $8, %r14 - jle LABEL(tail) + cmp $3, %r11 + jbe LABEL(strcmp_exitz) #endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_13) + +/* + * ashr_14 handles the following cases: + * abs(str1 offset - str2 offset) = 2 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_14): + add $16, %r10 + jg LABEL(nibble_ashr_14) + +LABEL(gobble_ashr_14): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - mov %rbx, %r8 - add %rax, %r8 - sbb %r10, %r10 +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif - mov %rbx, %r9 - add %rdx, %r9 - sbb %r11, %r11 + add $16, %rcx + movdqa %xmm4, %xmm3 - xor %rax, %r8 - or %rbx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + add $16, %r10 + jg LABEL(nibble_ashr_14) /* cross page boundary */ - xor %rdx, %r9 - or %rbx, %r9 - sub %r11, %r9 - jnz LABEL(tail) + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 - cmp %rdx, %rax - jne LABEL(tail) + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 - add $8, %ecx + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) - jmp LABEL(64loop) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif -LABEL(64after): + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_14) + + .p2align 4 +LABEL(nibble_ashr_14): + psrldq $14, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x0003, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $2, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_14) -LABEL(tailtry): +/* + * ashr_15 handles the following cases: + * abs(str1 offset - str2 offset) = 1 + */ + .p2align 4 +LABEL(ashr_15): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* rdi bytes already examined. Used in exit code */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we are crossing a page boundary and + * need to do a nibble. 
+ */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + movdqa %xmm3, %xmm4 + + .p2align 4 +LABEL(loop_ashr_15): + add $16, %r10 + jg LABEL(nibble_ashr_15) + +LABEL(gobble_ashr_15): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) -LABEL(tail): /* byte tail */ #ifdef USE_AS_STRNCMP - add $7, %r14 + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - cmp %dl, %al /* check if same character */ - jne LABEL(exit) - test %al, %al /* check if character a NUL */ - jz LABEL(exit) + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_15) /* cross page boundary */ - shr $8, %rax - shr $8, %rdx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exit) + sub $16, %r11 + jbe LABEL(strcmp_exitz) #endif - cmp %dl, %al - jne LABEL(exit) - test %al, %al - jz LABEL(exit) + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_15) + + .p2align 4 +LABEL(nibble_ashr_15): + psrldq $15, %xmm4 + movdqa (%rsi, %rcx), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0x0001, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + cmp $1, %r11 + jbe LABEL(strcmp_exitz) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_15) + + .p2align 4 +LABEL(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + /* + * Check to see if BSF is fast on this processor. If not, use a different + * exit tail. + */ + testl $USE_BSF,.memops_method(%rip) + jz LABEL(AMD_exit) + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#ifdef USE_AS_STRNCMP + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + xor %ecx, %ecx /* clear %ecx */ + xor %eax, %eax /* clear %eax */ + + movb (%rsi, %rdx), %cl + movb (%rdi, %rdx), %al - shr $8, %rax - shr $8, %rdx + sub %ecx, %eax + ret #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exit) +LABEL(strcmp_exitz): + xor %eax, %eax + ret #endif - cmp %dl, %al - jne LABEL(exit) - test %al, %al - jz LABEL(exit) + /* + * This exit tail does not use the bsf instruction. 
+ */ + .p2align 4 +LABEL(AMD_exit): + test %dl, %dl + jz LABEL(next_8_bytes) + + test $0x01, %dl + jnz LABEL(Byte0) + + test $0x02, %dl + jnz LABEL(Byte1) + + test $0x04, %dl + jnz LABEL(Byte2) + + test $0x08, %dl + jnz LABEL(Byte3) + + test $0x10, %dl + jnz LABEL(Byte4) + + test $0x20, %dl + jnz LABEL(Byte5) + + test $0x40, %dl + jnz LABEL(Byte6) + +#ifdef USE_AS_STRNCMP + sub $7, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 7(%rsi), %ecx + movzx 7(%rdi), %eax - shr $8, %rax - shr $8, %rdx + sub %ecx, %eax + ret + .p2align 4 +LABEL(Byte0): + /* + * never need to handle byte 0 for strncmpy #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exit) + sub $0, %r11 + jbe LABEL(strcmp_exitz) #endif + */ + movzx (%rsi), %ecx + movzx (%rdi), %eax - cmp %dl, %al - jne LABEL(exit) - test %al, %al - jz LABEL(exit) + sub %ecx, %eax + ret - shr $8, %rax - shr $8, %rdx + .p2align 4 +LABEL(Byte1): #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exit) + sub $1, %r11 + jbe LABEL(strcmp_exitz) #endif + movzx 1(%rsi), %ecx + movzx 1(%rdi), %eax - cmp %dl, %al - jne LABEL(exit) - test %al, %al - jz LABEL(exit) + sub %ecx, %eax + ret - shr $8, %eax - shr $8, %edx + .p2align 4 +LABEL(Byte2): #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exit) + sub $2, %r11 + jbe LABEL(strcmp_exitz) #endif + movzx 2(%rsi), %ecx + movzx 2(%rdi), %eax - cmp %dl, %al - jne LABEL(exit) - test %al, %al - jz LABEL(exit) + sub %ecx, %eax + ret - shr $8, %eax - shr $8, %edx + .p2align 4 +LABEL(Byte3): #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exit) + sub $3, %r11 + jbe LABEL(strcmp_exitz) #endif + movzx 3(%rsi), %ecx + movzx 3(%rdi), %eax - cmp %dl, %al - jne LABEL(exit) - test %al, %al - jz LABEL(exit) + sub %ecx, %eax + ret - shr $8, %eax - shr $8, %edx + .p2align 4 +LABEL(Byte4): #ifdef USE_AS_STRNCMP - dec %r14 - jl LABEL(exit) + sub $4, %r11 + jbe LABEL(strcmp_exitz) #endif + movzx 4(%rsi), %ecx + movzx 4(%rdi), %eax - cmp %dl, %al - jne LABEL(exit) + sub %ecx, %eax + ret - .p2align 4,, 15 + .p2align 4 +LABEL(Byte5): -LABEL(tailafter): +#ifdef USE_AS_STRNCMP + sub $5, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 5(%rsi), %ecx + movzx 5(%rdi), %eax -LABEL(exit): - mov -32 (%rsp), %r15 - mov -24 (%rsp), %rbp - mov -16 (%rsp), %rbx + sub %ecx, %eax + ret - .p2align 4,, 3 + .p2align 4 +LABEL(Byte6): -LABEL(exitafter): #ifdef USE_AS_STRNCMP - test %r14, %r14 - cmovl %edx, %eax + sub $6, %r11 + jbe LABEL(strcmp_exitz) #endif + movzx 6(%rsi), %ecx + movzx 6(%rdi), %eax - movzx %al, %eax - movzx %dl, %edx - sub %eax, %edx - xchg %edx, %eax + sub %ecx, %eax + ret + .p2align 4 +LABEL(next_8_bytes): + add $8, %rdi + add $8, %rsi #ifdef USE_AS_STRNCMP -LABEL(exitz): - mov -8 (%rsp), %r14 + sub $8, %r11 + jbe LABEL(strcmp_exitz) #endif - ret + test $0x01, %dh + jnz LABEL(Byte0) + + test $0x02, %dh + jnz LABEL(Byte1) + test $0x04, %dh + jnz LABEL(Byte2) + + test $0x08, %dh + jnz LABEL(Byte3) + + test $0x10, %dh + jnz LABEL(Byte4) + + test $0x20, %dh + jnz LABEL(Byte5) + + test $0x40, %dh + jnz LABEL(Byte6) + +#ifdef USE_AS_STRNCMP + sub $7, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 7(%rsi), %ecx + movzx 7(%rdi), %eax + + sub %ecx, %eax + ret + + .pushsection .rodata + .p2align 4 +LABEL(unaligned_table): + .int LABEL(ashr_0) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_9) 
- LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_1) - LABEL(unaligned_table) + .popsection #ifdef USE_AS_STRNCMP SET_SIZE(strncmp) #else diff --git a/usr/src/lib/libc/amd64/gen/strcpy.s b/usr/src/lib/libc/amd64/gen/strcpy.s index f4de3ab1f1..080fe913ae 100644 --- a/usr/src/lib/libc/amd64/gen/strcpy.s +++ b/usr/src/lib/libc/amd64/gen/strcpy.s @@ -1,862 +1,2582 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END */ /* - * Copyright (c) 2002 Advanced Micro Devices, Inc. - * + * Copyright (c) 2009, Intel Corporation * All rights reserved. - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the - * following conditions are met: - * - * + Redistributions of source code must retain the above - * copyright notice, this list of conditions and the - * following disclaimer. - * - * + Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the - * following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - * + Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or - * promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND - * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES, - * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * It is licensee's responsibility to comply with any export - * regulations applicable in licensee's jurisdiction. 
 */
-	.file	"strcpy.s"
-
+/*
+ * str[n]cpy - copy [n] chars from second operand into first operand
+ */
 #include "SYS.h"
-#include "cache.h"
+#include "proc64_id.h"
 
 #define LABEL(s) .strcpy/**/s
 
 #ifdef USE_AS_STRNCPY
 	ENTRY(strncpy)
+	test	%edx, %edx
+	jz	LABEL(strncpy_exitz)
+	mov	%rdx, %r8
 #else
-	ENTRY(strcpy) /* (char *, const char *) */
+	ENTRY(strcpy)			/* (char *, const char *) */
+	xor	%rdx, %rdx
+#endif
+	mov	%esi, %ecx
+	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
+	and	$0xf, %rcx
+	mov	%rdi, %rax		/* save destination address for return value */
+
+
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for null */
+	pmovmskb %xmm0, %edx
+	shr	%cl, %edx		/* adjust for offset from 16byte boundary */
+	test	%edx, %edx		/* edx will be 0 if chars are non-null */
+	jnz	LABEL(less16bytes)	/* null char found in first 16 bytes examined */
+#ifdef USE_AS_STRNCPY
+	/*
+	 * Check if the count is satisfied in first 16 bytes examined.
+	 */
+	lea	-16(%r8, %rcx), %r11
+	cmp	$0, %r11
+	jle	LABEL(less16bytes)
+#endif
+	mov	%rcx, %r9		/* rsi alignment offset */
+	or	%edi, %ecx
+	and	$0xf, %ecx
+	lea	-16(%r9), %r10
+	jz	LABEL(ashr_0)		/* src and dest are both 16 byte aligned */
+
+	neg	%r10			/* max src bytes remaining in current dqword */
+
+	pxor	%xmm0, %xmm0		/* clear %xmm0, may be polluted by unaligned operation */
+	pcmpeqb	16(%rsi), %xmm0		/* check next 16 bytes in src for a null */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(less32bytes)	/* null char found in first 32 bytes examined */
+
+#ifdef USE_AS_STRNCPY
+	/*
+	 * If strncpy count <= 16 go to exit case
+	 */
+	sub	$16, %r8
+	jbe	LABEL(less32bytes_strncpy_truncation)
+#endif
+	/*
+	 * At least 16 bytes to copy to destination string. Move them now.
+	 * Don't worry about alignment.
+	 */
+	mov	(%rsi, %r9), %rdx
+	mov	%rdx, (%rdi)
+	mov	8(%rsi, %r9), %rdx
+	mov	%rdx, 8(%rdi)
+
+	/*
+	 * so far destination rdi may be aligned by 16, re-calculate rsi and
+	 * jump to corresponding src/dest relative offset case.
+	 *	rcx is offset of rsi
+	 *	rdx is offset of rdi
+	 */
+	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
+	mov	%rax, %rdx		/* rax contains original rdi */
+	xor	%rdi, %rdx		/* same effect as "and $0xf, %rdx" */
+#ifdef USE_AS_STRNCPY
+	/*
+	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
+	 * (ie store twice) if destination was unaligned. Compensate here.
+	 */
+	add	%rdx, %r8		/* compensate for overlap */
+#endif
+
+	add	$16, %rdi		/* next 16 bytes for dest */
+
+	/*
+	 * align src to 16-byte boundary. Could be up or down depending on
+	 * whether src offset - dest offset > 0 (up) or
+	 * src offset - dest offset < 0 (down).
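
The prolog above rounds %rsi down to a 16-byte boundary, runs pcmpeqb against a zeroed register, and shifts the resulting pmovmskb bitmask right by the original misalignment, so bytes that sit below the start of the string cannot register as false null hits; the aligned over-read is safe because an aligned 16-byte load cannot cross a page boundary. A minimal C sketch of that scan, using SSE2 intrinsics (the function name is illustrative, not a symbol from this change):

	#include <emmintrin.h>	/* SSE2 intrinsics */
	#include <stdint.h>

	/*
	 * Bit i of the result is set when byte i of s is '\0', scanning the
	 * aligned 16-byte block that contains s. Bits belonging to bytes
	 * below s are shifted out, mirroring the "shr %cl, %edx" above.
	 */
	static unsigned
	null_mask_first16(const char *s)
	{
		unsigned off = (unsigned)((uintptr_t)s & 0xf);
		const __m128i *blk =
		    (const __m128i *)((uintptr_t)s & ~(uintptr_t)0xf);
		__m128i eq = _mm_cmpeq_epi8(_mm_load_si128(blk),
		    _mm_setzero_si128());
		return (((unsigned)_mm_movemask_epi8(eq)) >> off);
	}

A nonzero return means the terminator lies within the first 16 - off bytes examined, which is the LABEL(less16bytes) case.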
+ */ + sub %rdx, %r9 /* src offset - dest offset */ + + lea 16(%r9, %rsi), %rsi + mov %esi, %ecx /* for new src offset */ + and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ + + and $0xf, %ecx /* new src offset is 0 if rsi/rdi have same alignment */ + jz LABEL(ashr_0) + +#ifdef USE_AS_STRNCPY + xor %edx, %edx /* In case unaligned_exit is taken */ +#endif + /* + * Jump to case corresponding to source/dest string relative offsets + * Index = (16 + (src offset - dest offset)) % 16 + */ + lea -16(%rcx), %r10 + mov %rcx, %r9 + neg %r10 /* max src bytes remaining in current dqword */ + lea LABEL(unaligned_table)(%rip), %r11 + movslq (%r11, %rcx, 4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + +/* + * ashr_0 handles the following cases: + * src alignment offset = dest alignment offset + */ + .p2align 5 +LABEL(ashr_0): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) #endif + movdqa (%rsi), %xmm1 /* fetch 16 bytes from src string */ + movdqa %xmm1, (%rdi) /* store 16 bytes into dest string */ + add $16, %rsi + add $16, %rdi + pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for a null */ + pmovmskb %xmm0, %edx + + test %edx, %edx /* edx will be 0 if chars are non-null */ + jnz LABEL(aligned_16bytes) /* exit tail */ + +LABEL(ashr_0_loop): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) #ifdef USE_AS_STRNCPY - test %rdx, %rdx /* (char *, const char *, size_t) */ - mov %rdx, %r11 - jz LABEL(exitn) /* early exit */ + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) #endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) - xor %edx, %edx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) -LABEL(aligntry): - mov %rsi, %r8 /* align by source */ - and $7, %r8 - jz LABEL(alignafter) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) -LABEL(align): /* 8-byte align */ - sub $8, %r8 +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jz LABEL(ashr_0_loop) + jmp LABEL(aligned_exit) + + +/* + * ashr_15 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 15 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_15): + xor %ecx, %ecx /* clear index */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
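
Each ashr_N handler is assembled in two flavors, chosen at run time from the USE_SSSE3 bit in .memops_method: an SSSE3 loop built on palignr, and an SSE2 fallback. The assembler of the day lacked the SSSE3 mnemonics, so palignr is hand-encoded as the .byte sequences below, with the commented #palignr lines recording the intended instruction. In intrinsics, one 16-byte step of the ashr_15 SSSE3 loop amounts to roughly the following sketch (SSSE3's _mm_alignr_epi8; the names are illustrative, and both cursors are assumed 16-byte aligned as they are after the prolog):

	#include <tmmintrin.h>	/* SSSE3 intrinsics */

	/*
	 * Copy 16 dest-aligned bytes when src sits 15 bytes past its
	 * 16-byte boundary: take bytes 15..30 of the 32-byte pair lo:hi.
	 */
	static void
	ashr15_step(char *dst16, const char *src16, long i)
	{
		__m128i lo = _mm_load_si128((const __m128i *)(src16 + i));
		__m128i hi = _mm_load_si128((const __m128i *)(src16 + i + 16));
		_mm_store_si128((__m128i *)(dst16 + i),
		    _mm_alignr_epi8(hi, lo, 15));	/* palignr $15 */
	}

Here dst16 and src16 stand for the aligned destination and source cursors (%rdi and %rsi, %rcx being the common index).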
*/ + jz LABEL(ashr_15_use_sse2) .p2align 4 +LABEL(ashr_15_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $15, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0f -LABEL(alignloop): + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - dec %r11 - jl LABEL(exitn) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov (%rsi, %rdx), %al /* check if same character */ - test %al, %al /* check if character a NUL */ - mov %al, (%rdi, %rdx) - jz LABEL(exit) + #palignr $15, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0f - inc %edx - inc %r8 - jnz LABEL(alignloop) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - test %r11, %r11 /* must check remaining size */ - jz LABEL(exitn) /* If we've already done, exit */ + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + jmp LABEL(ashr_15_use_ssse3) .p2align 4 +LABEL(ashr_15_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif -LABEL(alignafter): + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 -LABEL(8try): - mov $0xfefefefefefefeff, %rcx + psrldq $15, %xmm2 + pslldq $1, %xmm3 + por %xmm2, %xmm3 -LABEL(8): /* 8-byte */ - mov (%rsi, %rdx), %rax + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 -LABEL(8loop): + psrldq $15, %xmm2 + pslldq $1, %xmm3 + por %xmm2, %xmm3 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + jmp LABEL(ashr_15_use_sse2) + - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 +/* + * ashr_14 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 14 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_14): + xor %ecx, %ecx /* clear index */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
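
When SSSE3 is not available, the same realignment is synthesized from SSE2 byte shifts, as in ashr_15_use_sse2 above: the current 16 source bytes are shifted right by N, the next 16 shifted left by 16 - N, and the halves merged with por. Sketched with intrinsics (illustrative name, not from this file):

	#include <emmintrin.h>	/* SSE2 intrinsics */

	/*
	 * SSE2 stand-in for palignr $15: (lo >> 15 bytes) | (hi << 1 byte).
	 * The byte-shift counts must be compile-time constants.
	 */
	static __m128i
	align15_sse2(__m128i lo, __m128i hi)
	{
		return (_mm_or_si128(_mm_srli_si128(lo, 15),
		    _mm_slli_si128(hi, 1)));
	}

Each ashr_N SSE2 body below hard-codes the matching pair psrldq $N / pslldq $(16 - N) for its own N.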
*/ + jz LABEL(ashr_14_use_sse2) + + .p2align 4 +LABEL(ashr_14_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $14, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0e - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $14, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0e + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_ssse3) + + .p2align 4 +LABEL(ashr_14_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $14, %xmm2 + pslldq $2, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $14, %xmm2 + pslldq $2, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_sse2) + +/* + * ashr_13 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 13 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_13): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_13_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_13_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $13, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0d - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $13, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0d + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_ssse3) + + .p2align 4 +LABEL(ashr_13_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $13, %xmm2 + pslldq $3, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $13, %xmm2 + pslldq $3, %xmm3 + por %xmm2, %xmm3 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_sse2) - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx +/* + * ashr_12 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 12 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_12): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_12_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_12_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $12, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0c - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + #palignr $12, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0c - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_12_use_ssse3) - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + .p2align 4 +LABEL(ashr_12_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 + + psrldq $12, %xmm2 + pslldq $4, %xmm3 + por %xmm2, %xmm3 -LABEL(8after): + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx -LABEL(64try): - mov _sref_(.amd64cache1half), %r9 +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif -LABEL(64): /* 64-byte */ + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - .p2align 4 + psrldq $12, %xmm2 + pslldq $4, %xmm3 + por %xmm2, %xmm3 -LABEL(64loop): + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + jmp LABEL(ashr_12_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) +/* + * ashr_11 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 11 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_11): + xor %ecx, %ecx /* clear index */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
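
The reason there are fifteen near-identical ashr_N bodies rather than one parameterized loop: psrldq, pslldq and palignr accept only immediate shift counts, so every possible source/destination misalignment needs its own code with the constant baked in, reached through unaligned_table. The table stores .int offsets relative to its own label instead of absolute addresses, which keeps the dispatch position-independent. A hypothetical C rendering of the dispatch (the handler table is a stand-in, not a symbol from this file):

	#include <stdint.h>

	typedef void (*ashr_fn)(char *, const char *);
	extern const ashr_fn ashr_handlers[16];	/* ashr_0 .. ashr_15 */

	static ashr_fn
	pick_handler(const char *dst, const char *src)
	{
		/* Index = (16 + (src offset - dest offset)) % 16 */
		unsigned k = (16 + (unsigned)((uintptr_t)src & 0xf) -
		    (unsigned)((uintptr_t)dst & 0xf)) & 0xf;
		return (ashr_handlers[k]);
	}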
*/ + jz LABEL(ashr_11_use_sse2) - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + .p2align 4 +LABEL(ashr_11_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $11, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0b + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + #palignr $11, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0b + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_ssse3) + + .p2align 4 +LABEL(ashr_11_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $11, %xmm2 + pslldq $5, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $11, %xmm2 + pslldq $5, %xmm3 + por %xmm2, %xmm3 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_sse2) - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx +/* + * ashr_10 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 10 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_10): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_10_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_10_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $10, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0a + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $10, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x0a - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_10_use_ssse3) + .p2align 4 +LABEL(ashr_10_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $10, %xmm2 + pslldq $6, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 + + psrldq $10, %xmm2 + pslldq $6, %xmm3 + por %xmm2, %xmm3 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_10_use_sse2) - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx +/* + * ashr_9 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 9 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_9): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_9_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_9_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $9, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x09 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + #palignr $9, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x09 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_9_use_ssse3) - cmp %r9, %rdx + .p2align 4 +LABEL(ashr_9_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - lea 8 (%rdx), %rdx + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - jbe LABEL(64loop) + psrldq $9, %xmm2 + pslldq $7, %xmm3 + por %xmm2, %xmm3 -LABEL(64after): + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx -LABEL(pretry): - mov _sref_(.amd64cache2half), %r9 +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 -LABEL(pre): /* 64-byte prefetch */ + psrldq $9, %xmm2 + pslldq $7, %xmm3 + por %xmm2, %xmm3 - .p2align 4 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_9_use_sse2) + + +/* + * ashr_8 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 8 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_8): + xor %ecx, %ecx /* clear index */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_8_use_sse2) -LABEL(preloop): + .p2align 4 +LABEL(ashr_8_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + #palignr $8, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x08 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $8, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x08 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_8_use_ssse3) + .p2align 4 +LABEL(ashr_8_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $8, %xmm2 + pslldq $8, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $8, %xmm2 + pslldq $8, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_8_use_sse2) + +/* + * ashr_7 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 7 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_7): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_7_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_7_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $7, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x07 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $7, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x07 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_7_use_ssse3) + .p2align 4 +LABEL(ashr_7_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $7, %xmm2 + pslldq $9, %xmm3 + por %xmm2, %xmm3 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + psrldq $7, %xmm2 + pslldq $9, %xmm3 + por %xmm2, %xmm3 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_7_use_sse2) - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx +/* + * ashr_6 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 6 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_6): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_6_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_6_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + #palignr $6, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x06 - mov %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %edx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + #palignr $6, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x06 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(tail) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_6_use_ssse3) - cmp %r9, %rdx + .p2align 4 +LABEL(ashr_6_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - mov %rax, (%rdi, %rdx) - prefetchnta 512 + 8 (%rdi, %rdx) /* 3DNow: use prefetchw */ - mov 8 (%rsi, %rdx), %rax - prefetchnta 512 + 8 (%rsi, %rdx) /* 3DNow: use prefetch */ - lea 8 (%rdx), %rdx + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - jb LABEL(preloop) + psrldq $6, %xmm2 + pslldq $10, %xmm3 + por %xmm2, %xmm3 - .p2align 4 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx -LABEL(preafter): +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif -LABEL(NTtry): - mfence + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 -LABEL(NT): /* 64-byte NT */ + psrldq $6, %xmm2 + pslldq $10, %xmm3 + por %xmm2, %xmm3 - .p2align 4 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_6_use_sse2) + + +/* + * ashr_5 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 5 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_5): + xor %ecx, %ecx /* clear index */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
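
Threaded through all of these unrolled loops is the strncpy bookkeeping: %r8 carries the remaining count, and every 16-byte step either decrements it (sub $16, %r8) or checks it against the bytes left in the first partially consumed block (%r10) before probing further. For reference, the contract this machinery implements is the classic one (plain C, illustrative only):

	#include <stddef.h>

	/*
	 * Copy at most n bytes, stopping after the first '\0'; when src is
	 * shorter than n, the remainder of dst is zero-filled, which is the
	 * job of strncpy_fill_tail below.
	 */
	static char *
	strncpy_ref(char *dst, const char *src, size_t n)
	{
		size_t i = 0;

		while (i < n && src[i] != '\0') {
			dst[i] = src[i];
			i++;
		}
		while (i < n)
			dst[i++] = '\0';	/* rep stosq + byte loop in the asm */
		return (dst);
	}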
*/ + jz LABEL(ashr_5_use_sse2) -LABEL(NTloop): + .p2align 4 +LABEL(ashr_5_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + #palignr $5, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x05 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + #palignr $5, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x05 - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %rdx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_5_use_ssse3) + .p2align 4 +LABEL(ashr_5_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + psrldq $5, %xmm2 + pslldq $11, %xmm3 + por %xmm2, %xmm3 - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %rdx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + psrldq $5, %xmm2 + pslldq $11, %xmm3 + por %xmm2, %xmm3 - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %rdx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_5_use_sse2) + +/* + * ashr_4 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 4 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_4): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_4_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_4_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + #palignr $4, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x04 - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %rdx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + #palignr $4, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x04 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_4_use_ssse3) - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %rdx + .p2align 4 +LABEL(ashr_4_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 + + psrldq $4, %xmm2 + pslldq $12, %xmm3 + por %xmm2, %xmm3 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 + + psrldq $4, %xmm2 + pslldq $12, %xmm3 + por %xmm2, %xmm3 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_4_use_sse2) - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %rdx +/* + * ashr_3 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 3 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_3): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_3_use_sse2) - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + .p2align 4 +LABEL(ashr_3_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + #palignr $3, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x03 - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - add $8, %rdx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - sub $8, %r11 - jle LABEL(tail) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - mov %rcx, %r8 - add %rax, %r8 - sbb %r10, %r10 + #palignr $3, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x03 - xor %rax, %r8 - or %rcx, %r8 - sub %r10, %r8 - jnz LABEL(NTtail) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_3_use_ssse3) - movnti %rax, (%rdi, %rdx) - mov 8 (%rsi, %rdx), %rax - prefetchnta 768 + 8 (%rsi, %rdx) - add $8, %rdx + .p2align 4 +LABEL(ashr_3_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - jmp LABEL(NTloop) + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - .p2align 4 + psrldq $3, %xmm2 + pslldq $13, %xmm3 + por %xmm2, %xmm3 -LABEL(NTtail): - mfence + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx - .p2align 4 +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif -LABEL(NTafter): + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 -LABEL(tailtry): + psrldq $3, %xmm2 + pslldq $13, %xmm3 + por %xmm2, %xmm3 -LABEL(tail): /* 1-byte tail */ + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - add $8, %r11 + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + jmp LABEL(ashr_3_use_sse2) - .p2align 4 -LABEL(tailloop): +/* + * ashr_2 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 2 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_2): + xor %ecx, %ecx /* clear index */ #ifdef USE_AS_STRNCPY - dec %r11 - jl LABEL(exitn) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) #endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_2_use_sse2) - test %al, %al - mov %al, (%rdi, %rdx) - jz LABEL(exit) + .p2align 4 +LABEL(ashr_2_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - inc %rdx + #palignr $2, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x02 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $2, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x02 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - dec %r11 - jl LABEL(exitn) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_2_use_ssse3) - mov %ah, %al + .p2align 4 +LABEL(ashr_2_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - test %ah, %ah - mov %ah, (%rdi, %rdx) - jz LABEL(exit) + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - inc %rdx + psrldq $2, %xmm2 + pslldq $14, %xmm3 + por %xmm2, %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY - dec %r11 - jl LABEL(exitn) + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif - shr $16, %rax + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 + + psrldq $2, %xmm2 + pslldq $14, %xmm3 + por %xmm2, %xmm3 - test %al, %al - mov %al, (%rdi, %rdx) - jz LABEL(exit) + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_2_use_sse2) - inc %rdx +/* + * ashr_1 handles the following cases: + * (16 + (src offset - dest offset)) % 16 = 1 + * + * Based on above operation, start from (%r9 + rsi) to the left of this cache + * bank, there is no null byte. + */ + .p2align 4 +LABEL(ashr_1): + xor %ecx, %ecx /* clear index */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ + jz LABEL(ashr_1_use_sse2) + + .p2align 4 +LABEL(ashr_1_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + #palignr $1, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x01 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + #palignr $1, (%rsi, %rcx), %xmm3 + .byte 0x66, 0x0F, 0x3A ,0x0F + .byte 0x1c, 0x0e, 0x01 + + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - dec %r11 - jl LABEL(exitn) + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_1_use_ssse3) - mov %ah, %al + .p2align 4 +LABEL(ashr_1_use_sse2): + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) #endif + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 - test %ah, %ah - mov %ah, (%rdi, %rdx) - jz LABEL(exit) + psrldq $1, %xmm2 + pslldq $15, %xmm3 + por %xmm2, %xmm3 - shr $16, %rax - inc %rdx + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx - jmp LABEL(tailloop) +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + pcmpeqb 16(%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif - .p2align 4 + movdqa 16(%rsi, %rcx), %xmm3 + movdqa (%rsi, %rcx), %xmm2 -LABEL(tailafter): + psrldq $1, %xmm2 + pslldq $15, %xmm3 + por %xmm2, %xmm3 -LABEL(exit): + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx #ifdef USE_AS_STRNCPY - test %r11, %r11 - mov %r11, %rcx + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_1_use_sse2) -#ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %r8 -#else - mov %rdi, %r8 + + /* + * Exit tail code: + * Up to 32 bytes are copied in the case of strcpy. + */ + .p2align 4 +LABEL(less32bytes): + xor %ecx, %ecx +LABEL(unaligned_exit): + add %r9, %rsi /* r9 holds offset of rsi */ + mov %rcx, %r9 + mov %r10, %rcx + shl %cl, %edx /* after shl, calculate the exact number to be filled */ + mov %r9, %rcx + .p2align 4 +LABEL(aligned_exit): + add %rcx, %rdi /* locate exact address for rdi */ +LABEL(less16bytes): + add %rcx, %rsi /* locate exact address for rsi */ +LABEL(aligned_16bytes): +#ifdef USE_AS_STRNCPY + /* + * Null found in 16bytes checked. Set bit in bitmask corresponding to + * the strncpy count argument. We will copy to the null (inclusive) + * or count whichever comes first. + */ + mov $1, %r9d + lea -1(%r8), %rcx + shl %cl, %r9d + cmp $32, %r8 + ja LABEL(strncpy_tail) + or %r9d, %edx +LABEL(strncpy_tail): #endif + /* + * Check to see if BSF is fast on this processor. If not, use a + * different exit tail. + */ + testb $USE_BSF, .memops_method(%rip) + jz LABEL(AMD_exit) + bsf %rdx, %rcx /* Find byte with null char */ + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx, 4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx - jz 2f +#ifdef USE_AS_STRNCPY + /* + * Count reached before null found. 
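
The exit path just above folds the strncpy limit into the null bitmask: when 32 or fewer bytes remain, bit (count - 1) is OR-ed into %edx so an expiring count looks exactly like a terminator, and a single bsf then finds whichever stop condition comes first, indexing tail_table to copy that many bytes. A C sketch of the idea (valid only when a null was seen or the count is at most 32, as in the code above; GCC's __builtin_ctz stands in for bsf):

	/*
	 * null_mask: pcmpeqb/pmovmskb result; n: bytes still allowed, n >= 1.
	 * Returns the number of bytes to copy, including the stopping byte.
	 */
	static unsigned
	tail_bytes(unsigned null_mask, unsigned long n)
	{
		if (n <= 32)
			null_mask |= 1u << (n - 1);	/* count bit acts as a null */
		return (__builtin_ctz(null_mask) + 1);
	}

The truncation labels below reach the same tail_table with the count rather than a null position, and strncpy_fill_tail pads any shortfall with zero bytes.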
+ */ + .p2align 4 +LABEL(less32bytes_strncpy_truncation): + xor %ecx, %ecx +LABEL(strncpy_truncation_unaligned): + add %r9, %rsi /* next src char to copy */ +LABEL(strncpy_truncation_aligned): + add %rcx, %rdi + add %rcx, %rsi + add $16, %r8 /* compensation */ + lea -1(%r8), %rcx + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx, 4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx - xor %eax, %eax /* bzero () would do too, but usually there are only a handfull of bytes left */ - shr $3, %rcx - lea 1 (%rdi, %rdx), %rdi - jz 1f + .p2align 4 +LABEL(strncpy_exitz): + mov %rdi, %rax + ret +#endif - rep stosq + .p2align 4 +LABEL(AMD_exit): + test %dl, %dl + jz LABEL(AMD_exit_more_8) + test $0x01, %dl + jnz LABEL(tail_0) + test $0x02, %dl + jnz LABEL(tail_1) + test $0x04, %dl + jnz LABEL(tail_2) + test $0x08, %dl + jnz LABEL(tail_3) + test $0x10, %dl + jnz LABEL(tail_4) + test $0x20, %dl + jnz LABEL(tail_5) + test $0x40, %dl + jnz LABEL(tail_6) -1: - mov %r11d, %ecx - and $7, %ecx - jz 2f + .p2align 4 +LABEL(tail_7): /* 8 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) +#ifdef USE_AS_STRNCPY + mov $8, %cl + sub $8, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret - .p2align 4,, 3 +#ifdef USE_AS_STRNCPY + /* + * Null terminated src string shorter than count. Fill the rest of the + * destination with null chars. + */ + .p2align 4 +LABEL(strncpy_fill_tail): + mov %rax, %rdx + movzx %cl, %rax + mov %r8, %rcx + add %rax, %rdi + xor %eax, %eax + shr $3, %ecx + jz LABEL(strncpy_fill_less_8) -3: - dec %ecx + rep stosq +LABEL(strncpy_fill_less_8): + mov %r8, %rcx + and $7, %rcx + jz LABEL(strncpy_fill_return) +LABEL(strncpy_fill_less_7): + sub $1, %ecx mov %al, (%rdi, %rcx) - jnz 3b + jnz LABEL(strncpy_fill_less_7) +LABEL(strncpy_fill_return): + mov %rdx, %rax + ret +#endif - .p2align 4,, 3 + .p2align 4 +LABEL(tail_0): /* 1 byte */ + mov (%rsi), %cl + mov %cl, (%rdi) +#ifdef USE_AS_STRNCPY + mov $1, %cl + sub $1, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret -2: - mov %r8, %rax - ret + .p2align 4 +LABEL(tail_1): /* 2 bytes */ + mov (%rsi), %cx + mov %cx, (%rdi) +#ifdef USE_AS_STRNCPY + mov $2, %cl + sub $2, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_2): /* 3 bytes */ + mov (%rsi), %cx + mov %cx, (%rdi) + mov 1(%rsi), %cx + mov %cx, 1(%rdi) +#ifdef USE_AS_STRNCPY + mov $3, %cl + sub $3, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + .p2align 4 +LABEL(tail_3): /* 4 bytes */ + mov (%rsi), %ecx + mov %ecx, (%rdi) +#ifdef USE_AS_STRNCPY + mov $4, %cl + sub $4, %r8 + jnz LABEL(strncpy_fill_tail) #endif + ret - .p2align 4 + .p2align 4 +LABEL(tail_4): /* 5 bytes */ + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 1(%rsi), %edx + mov %edx, 1(%rdi) +#ifdef USE_AS_STRNCPY + mov $5, %cl + sub $5, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret -LABEL(exitn): -#ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -#else - mov %rdi, %rax + .p2align 4 +LABEL(tail_5): /* 6 bytes */ + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 2(%rsi), %edx + mov %edx, 2(%rdi) +#ifdef USE_AS_STRNCPY + mov $6, %cl + sub $6, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_6): /* 7 bytes */ + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 3(%rsi), %edx + mov %edx,3(%rdi) +#ifdef USE_AS_STRNCPY + mov $7, %cl + sub $7, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_8): /* 9 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %edx + mov %edx, 5(%rdi) +#ifdef USE_AS_STRNCPY + mov $9, %cl + sub $9, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 
4 +LABEL(AMD_exit_more_8): + test %dh, %dh + jz LABEL(AMD_exit_more_16) + test $0x01, %dh + jnz LABEL(tail_8) + test $0x02, %dh + jnz LABEL(tail_9) + test $0x04, %dh + jnz LABEL(tail_10) + test $0x08, %dh + jnz LABEL(tail_11) + test $0x10, %dh + jnz LABEL(tail_12) + test $0x20, %dh + jnz LABEL(tail_13) + test $0x40, %dh + jnz LABEL(tail_14) + + .p2align 4 +LABEL(tail_15): /* 16 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) +#ifdef USE_AS_STRNCPY + mov $16, %cl + sub $16, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_9): /* 10 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %edx + mov %edx, 6(%rdi) +#ifdef USE_AS_STRNCPY + mov $10, %cl + sub $10, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_10): /* 11 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %edx + mov %edx, 7(%rdi) +#ifdef USE_AS_STRNCPY + mov $11, %cl + sub $11, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_11): /* 12 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %edx + mov %edx, 8(%rdi) +#ifdef USE_AS_STRNCPY + mov $12, %cl + sub $12, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_12): /* 13 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %rcx + mov %rcx, 5(%rdi) +#ifdef USE_AS_STRNCPY + mov $13, %cl + sub $13, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_13): /* 14 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %rcx + mov %rcx, 6(%rdi) +#ifdef USE_AS_STRNCPY + mov $14, %cl + sub $14, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_14): /* 15 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %rcx + mov %rcx, 7(%rdi) +#ifdef USE_AS_STRNCPY + mov $15, %cl + sub $15, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(AMD_exit_more_16): + shr $16, %edx + test %dl, %dl + jz LABEL(AMD_exit_more_24) + test $0x01, %dl + jnz LABEL(tail_16) + test $0x02, %dl + jnz LABEL(tail_17) + test $0x04, %dl + jnz LABEL(tail_18) + test $0x08, %dl + jnz LABEL(tail_19) + test $0x10, %dl + jnz LABEL(tail_20) + test $0x20, %dl + jnz LABEL(tail_21) + test $0x40, %dl + jnz LABEL(tail_22) + + .p2align 4 +LABEL(tail_23): /* 24 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) +#ifdef USE_AS_STRNCPY + mov $24, %cl + sub $24, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_16): /* 17 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cl + mov %cl, 16(%rdi) +#ifdef USE_AS_STRNCPY + mov $17, %cl + sub $17, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_17): /* 18 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cx + mov %cx, 16(%rdi) +#ifdef USE_AS_STRNCPY + mov $18, %cl + sub $18, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_18): /* 19 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %ecx + mov %ecx,15(%rdi) +#ifdef USE_AS_STRNCPY + mov $19, %cl + sub $19, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_19): /* 20 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %ecx + mov %ecx, 16(%rdi) +#ifdef USE_AS_STRNCPY + mov $20, %cl + sub $20, %r8 + 
jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_20): /* 21 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 13(%rsi), %rcx + mov %rcx, 13(%rdi) +#ifdef USE_AS_STRNCPY + mov $21, %cl + sub $21, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_21): /* 22 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 14(%rsi), %rcx + mov %rcx, 14(%rdi) +#ifdef USE_AS_STRNCPY + mov $22, %cl + sub $22, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_22): /* 23 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %rcx + mov %rcx, 15(%rdi) +#ifdef USE_AS_STRNCPY + mov $23, %cl + sub $23, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(AMD_exit_more_24): + test $0x01, %dh + jnz LABEL(tail_24) + test $0x02, %dh + jnz LABEL(tail_25) + test $0x04, %dh + jnz LABEL(tail_26) + test $0x08, %dh + jnz LABEL(tail_27) + test $0x10, %dh + jnz LABEL(tail_28) + test $0x20, %dh + jnz LABEL(tail_29) + test $0x40, %dh + jnz LABEL(tail_30) + + .p2align 4 +LABEL(tail_31): /* 32 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %rdx + mov %rdx, 24(%rdi) +#ifdef USE_AS_STRNCPY + mov $32, %cl + sub $32, %r8 + jnz LABEL(strncpy_fill_tail) #endif + ret - ret + .p2align 4 +LABEL(tail_24): /* 25 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %edx + mov %edx, 21(%rdi) +#ifdef USE_AS_STRNCPY + mov $25, %cl + sub $25, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_25): /* 26 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %edx + mov %edx, 22(%rdi) +#ifdef USE_AS_STRNCPY + mov $26, %cl + sub $26, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_26): /* 27 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %edx + mov %edx, 23(%rdi) +#ifdef USE_AS_STRNCPY + mov $27, %cl + sub $27, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_27): /* 28 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %edx + mov %edx, 24(%rdi) +#ifdef USE_AS_STRNCPY + mov $28, %cl + sub $28, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_28): /* 29 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %rdx + mov %rdx, 21(%rdi) +#ifdef USE_AS_STRNCPY + mov $29, %cl + sub $29, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_29): /* 30 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %rdx + mov %rdx, 22(%rdi) +#ifdef USE_AS_STRNCPY + mov $30, %cl + sub $30, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .p2align 4 +LABEL(tail_30): /* 31 bytes */ + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %rdx + mov %rdx, 23(%rdi) +#ifdef USE_AS_STRNCPY + mov $31, %cl + 
sub $31, %r8 + jnz LABEL(strncpy_fill_tail) +#endif + ret + + .pushsection .rodata + .p2align 4 +LABEL(tail_table): + .int LABEL(tail_0) - LABEL(tail_table) /* 1 byte */ + .int LABEL(tail_1) - LABEL(tail_table) + .int LABEL(tail_2) - LABEL(tail_table) + .int LABEL(tail_3) - LABEL(tail_table) + .int LABEL(tail_4) - LABEL(tail_table) + .int LABEL(tail_5) - LABEL(tail_table) + .int LABEL(tail_6) - LABEL(tail_table) + .int LABEL(tail_7) - LABEL(tail_table) + .int LABEL(tail_8) - LABEL(tail_table) + .int LABEL(tail_9) - LABEL(tail_table) + .int LABEL(tail_10) - LABEL(tail_table) + .int LABEL(tail_11) - LABEL(tail_table) + .int LABEL(tail_12) - LABEL(tail_table) + .int LABEL(tail_13) - LABEL(tail_table) + .int LABEL(tail_14) - LABEL(tail_table) + .int LABEL(tail_15) - LABEL(tail_table) + .int LABEL(tail_16) - LABEL(tail_table) + .int LABEL(tail_17) - LABEL(tail_table) + .int LABEL(tail_18) - LABEL(tail_table) + .int LABEL(tail_19) - LABEL(tail_table) + .int LABEL(tail_20) - LABEL(tail_table) + .int LABEL(tail_21) - LABEL(tail_table) + .int LABEL(tail_22) - LABEL(tail_table) + .int LABEL(tail_23) - LABEL(tail_table) + .int LABEL(tail_24) - LABEL(tail_table) + .int LABEL(tail_25) - LABEL(tail_table) + .int LABEL(tail_26) - LABEL(tail_table) + .int LABEL(tail_27) - LABEL(tail_table) + .int LABEL(tail_28) - LABEL(tail_table) + .int LABEL(tail_29) - LABEL(tail_table) + .int LABEL(tail_30) - LABEL(tail_table) + .int LABEL(tail_31) - LABEL(tail_table) /* 32 bytes */ + + .p2align 4 +LABEL(unaligned_table): + .int LABEL(ashr_0) - LABEL(unaligned_table) + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .popsection #ifdef USE_AS_STRNCPY SET_SIZE(strncpy) #else - SET_SIZE(strcpy) /* (char *, const char *) */ + SET_SIZE(strcpy) /* (char *, const char *) */ #endif diff --git a/usr/src/lib/libc/amd64/gen/strlen.s b/usr/src/lib/libc/amd64/gen/strlen.s index e33009d3e1..3b41235678 100644 --- a/usr/src/lib/libc/amd64/gen/strlen.s +++ b/usr/src/lib/libc/amd64/gen/strlen.s @@ -1,430 +1,199 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END */ - + /* - * Copyright (c) 2002 Advanced Micro Devices, Inc. - * + * Copyright (c) 2009, Intel Corporation * All rights reserved. - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the - * following conditions are met: - * - * + Redistributions of source code must retain the above - * copyright notice, this list of conditions and the - * following disclaimer. - * - * + Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the - * following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - * + Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or - * promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND - * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES, - * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * It is licensee's responsibility to comply with any export - * regulations applicable in licensee's jurisdiction. 
*/ - .file "strlen.s" +/* + * strlen - calculate the length of string + */ #include "SYS.h" -#include "cache.h" +#include "proc64_id.h" #define LABEL(s) .strlen/**/s - ENTRY(strlen) /* (const char *s) */ - - mov %rdi, %rsi - neg %rdi - -LABEL(aligntry): - mov %rsi , %r8 - and $7, %r8d - jz LABEL(alignafter) - -LABEL(align): /* 8-byte align */ - sub $8, %r8 - - .p2align 4 - -LABEL(alignloop): - cmpb $0, (%rsi) - je LABEL(exit) - - inc %rsi - inc %r8 - jnz LABEL(alignloop) - - .p2align 4 - -LABEL(alignafter): - -LABEL(56try): - -LABEL(56): /* 56-byte */ - mov (%rsi), %rax - mov $0xfefefefefefefeff, %rcx - -LABEL(56loop): - mov %rcx, %r8 - add %rax, %r8 - jnc LABEL(tail) - - xor %rax, %r8 - or %rcx, %r8 - inc %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - - mov %rcx, %r8 - add %rax, %r8 - jnc LABEL(tail) - - xor %rax, %r8 - or %rcx, %r8 - inc %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - - mov %rcx, %r8 - add %rax, %r8 - jnc LABEL(tail) - - xor %rax, %r8 - or %rcx, %r8 - inc %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - - mov %rcx, %r8 - add %rax, %r8 - jnc LABEL(tail) - - xor %rax, %r8 - or %rcx, %r8 - inc %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - - mov %rcx, %r8 - add %rax, %r8 - jnc LABEL(tail) - - xor %rax, %r8 - or %rcx, %r8 - inc %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - - mov %rcx, %r8 - add %rax, %r8 - jnc LABEL(tail) - - xor %rax, %r8 - or %rcx, %r8 - inc %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - - mov %rcx, %r8 - add %rax, %r8 - jnc LABEL(tail) - - xor %rax, %r8 - or %rcx, %r8 - inc %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - -LABEL(56after): - -LABEL(32): /* 32-byte */ - mov _sref_(.amd64cache1), %r9 - - .p2align 4 - -LABEL(32loop): - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - sub $32, %r9 - - mov 8 (%rsi), %rax - lea 8 (%rsi), %rsi - - jbe LABEL(32loop) - -LABEL(32after): - -LABEL(pretry): - -LABEL(pre): /* 64-byte prefetch */ - - .p2align 4 - -LABEL(preloop): - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov 
%rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - mov 8 (%rsi), %rax - add $8, %rsi - - mov %rcx, %r8 - add %rax, %r8 - sbb %rdx, %rdx - - xor %rax, %r8 - or %rcx, %r8 - sub %rdx, %r8 - jnz LABEL(tail) - - prefetchnta 512 (%rsi) /* 3DNow: use prefetch */ - - mov 8 (%rsi), %rax - add $8, %rsi - - jmp LABEL(preloop) - - .p2align 4 - -LABEL(preafter): - -LABEL(tailtry): - -LABEL(tail): /* 4-byte tail */ - -LABEL(tailloop): - test %al, %al - jz LABEL(exit) - - inc %rsi - - test %ah, %ah - jz LABEL(exit) - - inc %rsi - - test $0x00ff0000, %eax - jz LABEL(exit) - - inc %rsi - - test $0xff000000, %eax - jz LABEL(exit) - - inc %rsi - - shr $32, %rax - jmp LABEL(tailloop) - -LABEL(tailafter): - - .p2align 4 - + /* + * This implementation uses SSE instructions to compare up to 16 bytes + * at a time looking for the end of string (null char). + */ + ENTRY(strlen) /* (const char *s) */ + mov %rdi, %rsi /* keep original %rdi value */ + mov %rsi, %rcx + pxor %xmm0, %xmm0 /* 16 null chars */ + and $15, %rcx + jz LABEL(align16_loop) /* string is 16 byte aligned */ + + /* + * Unaligned case. Round down to 16-byte boundary before comparing + * 16 bytes for a null char. The code then compensates for any extra chars + * preceding the start of the string. + */ +LABEL(unalign16): + and $0xfffffffffffffff0, %rsi + + pcmpeqb (%rsi), %xmm0 + lea 16(%rdi), %rsi + pmovmskb %xmm0, %edx + + shr %cl, %edx /* Compensate for bytes preceding the string */ + test %edx, %edx + jnz LABEL(exit) + sub %rcx, %rsi /* no null, adjust to next 16-byte boundary */ + pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */ + + .p2align 4 +LABEL(align16_loop): /* 16 byte aligned */ + pcmpeqb (%rsi), %xmm0 /* look for null bytes */ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */ + + add $16, %rsi /* prepare to search next 16 bytes */ + test %edx, %edx /* if no null byte, %edx must be 0 */ + jnz LABEL(exit) /* found a null */ + + pcmpeqb (%rsi), %xmm0 + pmovmskb %xmm0, %edx + add $16, %rsi + test %edx, %edx + jnz LABEL(exit) + + pcmpeqb (%rsi), %xmm0 + pmovmskb %xmm0, %edx + add $16, %rsi + test %edx, %edx + jnz LABEL(exit) + + pcmpeqb (%rsi), %xmm0 + pmovmskb %xmm0, %edx + add $16, %rsi + test %edx, %edx + jz LABEL(align16_loop) + + .p2align 4 LABEL(exit): - lea (%rdi, %rsi), %rax - ret - + neg %rdi + /* + * Check to see if BSF is fast on this processor. If not, use a different + * exit tail to find first bit set indicating null byte match. + */ + testl $USE_BSF, .memops_method(%rip) + jz LABEL(AMD_exit) + + lea -16(%rdi, %rsi), %rax /* calculate exact offset */ + bsf %edx, %ecx /* Least significant 1 bit is index of null */ + lea (%rax, %rcx),%rax + ret + + /* + * This exit tail does not use the bsf instruction. 
+ */
+	.p2align 4
+LABEL(AMD_exit):
+	lea	-16(%rdi, %rsi), %rax
+	test	%dl, %dl
+	jz	LABEL(exit_high)
+	test	$0x01, %dl
+	jnz	LABEL(exit_tail0)
+
+	test	$0x02, %dl
+	jnz	LABEL(exit_tail1)
+
+	.p2align 4
+	test	$0x04, %dl
+	jnz	LABEL(exit_tail2)
+
+	test	$0x08, %dl
+	jnz	LABEL(exit_tail3)
+
+	test	$0x10, %dl
+	jnz	LABEL(exit_tail4)
+
+	test	$0x20, %dl
+	jnz	LABEL(exit_tail5)
+
+	test	$0x40, %dl
+	jnz	LABEL(exit_tail6)
+	add	$7, %rax
+	ret
+
+	.p2align 4
+LABEL(exit_high):
+	add	$8, %rax
+	test	$0x01, %dh
+	jnz	LABEL(exit_tail0)
+
+	test	$0x02, %dh
+	jnz	LABEL(exit_tail1)
+
+	test	$0x04, %dh
+	jnz	LABEL(exit_tail2)
+
+	test	$0x08, %dh
+	jnz	LABEL(exit_tail3)
+
+	test	$0x10, %dh
+	jnz	LABEL(exit_tail4)
+
+	test	$0x20, %dh
+	jnz	LABEL(exit_tail5)
+
+	test	$0x40, %dh
+	jnz	LABEL(exit_tail6)
+	add	$7, %rax
+	ret
+
+	.p2align 4
+LABEL(exit_tail0):
+	xor	%ecx, %ecx
+	ret
+
+	.p2align 4
+LABEL(exit_tail1):
+	add	$1, %rax
+	ret
+
+	.p2align 4
+LABEL(exit_tail2):
+	add	$2, %rax
+	ret
+
+	.p2align 4
+LABEL(exit_tail3):
+	add	$3, %rax
+	ret
+
+	.p2align 4
+LABEL(exit_tail4):
+	add	$4, %rax
+	ret
+
+	.p2align 4
+LABEL(exit_tail5):
+	add	$5, %rax
+	ret
+
+	.p2align 4
+LABEL(exit_tail6):
+	add	$6, %rax
+	ret
 SET_SIZE(strlen)
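A few implementation notes on the techniques in this change, with illustrative C sketches. None of the helper names below are part of this commit or of libc.

The tail_0..tail_31 paths finish a copy once pcmpeqb/pmovmskb has produced a byte mask with bit k set at the terminating null: the code branches to tail_k and copies exactly k+1 bytes with a few fixed-size moves, overlapping them when the length is not a multiple of the move size, instead of a byte loop. The STRNCPY build additionally records the copied length in %cl, subtracts it from the remaining count in %r8, and jumps to strncpy_fill_tail to zero-fill if any count remains. A minimal sketch of the overlapping-move idea for tails of at most 16 bytes, assuming GCC/Clang's __builtin_ctz and a hypothetical helper name:

	#include <string.h>

	/*
	 * copy_tail16 is hypothetical, not an illumos interface.
	 * Precondition: mask != 0, bit k set <=> src[k] == '\0', k < 16.
	 */
	static void
	copy_tail16(char *dst, const char *src, unsigned mask)
	{
		size_t n = (size_t)__builtin_ctz(mask) + 1; /* bytes, incl. null */

		if (n >= 8) {
			unsigned long long lo, hi;

			/* two overlapping 8-byte moves, as in tail_8..tail_15 */
			(void) memcpy(&lo, src, 8);
			(void) memcpy(&hi, src + n - 8, 8);
			(void) memcpy(dst, &lo, 8);
			(void) memcpy(dst + n - 8, &hi, 8);
		} else {
			(void) memcpy(dst, src, n); /* short-tail analogue */
		}
	}

Note also that the tail_table entries are stored as 32-bit offsets relative to the table itself (.int LABEL(tail_N) - LABEL(tail_table)): the values are link-time constants, so the dispatch table needs no load-time relocations and stays position-independent.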
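unaligned_table plays the same dispatch role for the misaligned-source paths, with one ashr_N body (defined in an earlier hunk, not shown here) per possible byte offset between the source pointer and its 16-byte alignment. Implementations in this family keep sixteen separate bodies because the SSSE3 byte-shift instruction takes its count as an immediate, so the shift amount cannot be a runtime variable. A sketch of one fixed-offset case, assuming SSSE3 intrinsics (whether this commit's ashr bodies use palignr is not visible in this hunk):

	#include <tmmintrin.h>	/* SSSE3 */

	/*
	 * Load the 16 stream bytes starting 5 bytes past the 16-byte
	 * aligned pointer p.  The shift count to _mm_alignr_epi8 must
	 * be a compile-time constant, hence one such routine (one
	 * ashr_N body) per offset.
	 */
	static __m128i
	load_at_offset5(const __m128i *p)
	{
		__m128i lo = _mm_load_si128(p);
		__m128i hi = _mm_load_si128(p + 1);

		return (_mm_alignr_epi8(hi, lo, 5)); /* stream bytes 5..20 */
	}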
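The new strlen follows one pattern end to end: round the pointer down to a 16-byte boundary, do a single aligned pcmpeqb/pmovmskb, shift the mask right by the misalignment (the shr %cl, %edx above) to discard bytes that precede the string, then loop 16 bytes at a time. A C rendering with SSE2 intrinsics, assuming GCC/Clang builtins and a hypothetical function name. The deliberate read of in-block bytes before the string start is safe at the machine level, since an aligned 16-byte load never crosses a page boundary, but it is not strictly conforming C; the original being assembly sidesteps that:

	#include <emmintrin.h>	/* SSE2 */
	#include <stdint.h>
	#include <stddef.h>

	size_t
	sse2_strlen(const char *s)
	{
		const __m128i zero = _mm_setzero_si128();
		uintptr_t a = (uintptr_t)s;
		const __m128i *p = (const __m128i *)(a & ~(uintptr_t)15);
		unsigned off = (unsigned)(a & 15);
		unsigned mask;

		/* first chunk: drop mask bits for bytes preceding s */
		mask = (unsigned)_mm_movemask_epi8(
		    _mm_cmpeq_epi8(_mm_load_si128(p), zero)) >> off;
		if (mask != 0)
			return ((size_t)__builtin_ctz(mask));

		for (;;) {	/* aligned loop, 16 bytes per iteration */
			p++;
			mask = (unsigned)_mm_movemask_epi8(
			    _mm_cmpeq_epi8(_mm_load_si128(p), zero));
			if (mask != 0)
				return ((size_t)((const char *)p - s) +
				    __builtin_ctz(mask));
		}
	}

The assembly unrolls the loop four times and keeps the string pointer 16 bytes ahead, but the mask arithmetic is the same.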
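Finally, the exit sequence is chosen at run time: strlen tests the USE_BSF bit in .memops_method (set by __proc64id) and, where bsf is fast, converts the mask to a byte index with a single bsf; otherwise the AMD_exit/exit_high branch tree tests one bit at a time, splitting on %dl/%dh first. In C terms, with a hypothetical compile-time switch standing in for that runtime test:

	/* null_index and USE_BSF_PATH are illustrative names only */
	static int
	null_index(unsigned mask)	/* 16-bit pmovmskb result, mask != 0 */
	{
	#ifdef USE_BSF_PATH
		return (__builtin_ctz(mask));	/* typically a bsf/tzcnt */
	#else
		int i = 0;

		if ((mask & 0xff) == 0) {	/* mirror the %dl/%dh split */
			mask >>= 8;
			i = 8;
		}
		while ((mask & 1) == 0) {	/* linear bit tests, as above */
			mask >>= 1;
			i++;
		}
		return (i);
	#endif
	}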