path: root/usr/src/lib/libc
author Edward Gillett <Edward.Gillett@Sun.COM> 2009-09-18 14:25:49 -0700
committer Edward Gillett <Edward.Gillett@Sun.COM> 2009-09-18 14:25:49 -0700
commit 533d3a4910febc9985154b885dbe971e3c21ca04 (patch)
tree c0161751a7f2919384220e54bcb6b7c5f7a8ade4 /usr/src/lib/libc
parent a28e62acca32a4c261beff5ecfd9a094a053e145 (diff)
download illumos-joyent-533d3a4910febc9985154b885dbe971e3c21ca04.tar.gz
6869408 64-bit libc string functions could be improved with SSE
Contributed by Ling Ma <ling.ma@intel.com>, Xinping Huang <xinping.huang@intel.com> and Robert Kasten <robert.a.kasten@intel.com>
Diffstat (limited to 'usr/src/lib/libc')
-rw-r--r-- usr/src/lib/libc/amd64/gen/proc64_id.c 3
-rw-r--r-- usr/src/lib/libc/amd64/gen/proc64_id.h 5
-rw-r--r-- usr/src/lib/libc/amd64/gen/proc64_support.s 4
-rw-r--r-- usr/src/lib/libc/amd64/gen/strcmp.s 2223
-rw-r--r-- usr/src/lib/libc/amd64/gen/strcpy.s 2834
-rw-r--r-- usr/src/lib/libc/amd64/gen/strlen.s 603
6 files changed, 4335 insertions, 1337 deletions
diff --git a/usr/src/lib/libc/amd64/gen/proc64_id.c b/usr/src/lib/libc/amd64/gen/proc64_id.c
index eac045037d..656244b4ad 100644
--- a/usr/src/lib/libc/amd64/gen/proc64_id.c
+++ b/usr/src/lib/libc/amd64/gen/proc64_id.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2009, Intel Corporation.
* All rights reserved.
*/
@@ -226,6 +226,7 @@ __proc64id(void)
if (cpuid_info.edx & CPUID_INTC_EDX_SSE2) {
use_sse |= USE_SSE2;
}
+ use_sse |= USE_BSF;
__intel_set_memops_method(use_sse);
} else {
__set_cache_sizes(INTEL_DFLT_L1_CACHE_SIZE,
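The hunk above turns on the USE_BSF bit unconditionally on the Intel path; the string routines then test this bit at run time to pick an exit tail. A minimal C sketch of that flag dispatch follows (the flag values mirror proc64_id.h, but select_exit_tail() and the message strings are hypothetical illustrations):

#include <stdio.h>

#define	NO_SSE	0x00	/* from proc64_id.h */
#define	USE_BSF	0x20

static int memops_method = NO_SSE;	/* set once at startup, as __proc64id() does */

static void
select_exit_tail(void)
{
	if (memops_method & USE_BSF)
		puts("bsf tail: index the first mismatching byte directly");
	else
		puts("byte tail: test the mismatch bits one at a time");
}

int
main(void)
{
	memops_method |= USE_BSF;	/* as the Intel branch above now does */
	select_exit_tail();
	return (0);
}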
diff --git a/usr/src/lib/libc/amd64/gen/proc64_id.h b/usr/src/lib/libc/amd64/gen/proc64_id.h
index 8722e7ff5a..98a00bfa85 100644
--- a/usr/src/lib/libc/amd64/gen/proc64_id.h
+++ b/usr/src/lib/libc/amd64/gen/proc64_id.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2008, Intel Corporation
+ * Copyright (c) 2009, Intel Corporation
* All rights reserved.
*/
@@ -38,7 +38,7 @@ extern "C" {
#endif
/*
- * Defines to determine what SSE instructions can be used for memops or strops.
+ * Defines to determine what SSE instructions can be used for memops or strops
*/
#define NO_SSE 0x00 /* Default -- Don't use SSE instructions */
#define USE_SSE2 0x01 /* SSE2 */
@@ -46,6 +46,7 @@ extern "C" {
#define USE_SSSE3 0x04 /* Supplemental SSE3 */
#define USE_SSE4_1 0x08 /* SSE 4.1 */
#define USE_SSE4_2 0x10 /* SSE 4.2 */
+#define USE_BSF 0x20 /* USE BSF class of instructions */
/*
* Cache size defaults for Core 2 Duo
diff --git a/usr/src/lib/libc/amd64/gen/proc64_support.s b/usr/src/lib/libc/amd64/gen/proc64_support.s
index 8f499acc38..37a48ee029 100644
--- a/usr/src/lib/libc/amd64/gen/proc64_support.s
+++ b/usr/src/lib/libc/amd64/gen/proc64_support.s
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2008, Intel Corporation
+ * Copyright (c) 2009, Intel Corporation
* All rights reserved.
*/
@@ -38,8 +38,6 @@
* cache size information. Cache information used by memset, strcpy, etc..
*/
- .file "proc64_support.s"
-
#include <sys/asm_linkage.h>
#include "proc64_id.h"
diff --git a/usr/src/lib/libc/amd64/gen/strcmp.s b/usr/src/lib/libc/amd64/gen/strcmp.s
index 13532e2b47..8d04a52534 100644
--- a/usr/src/lib/libc/amd64/gen/strcmp.s
+++ b/usr/src/lib/libc/amd64/gen/strcmp.s
@@ -1,539 +1,2048 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
*/
/*
- * Copyright (c) 2002 Advanced Micro Devices, Inc.
- *
+ * Copyright (c) 2009, Intel Corporation
* All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the
- * following conditions are met:
- *
- * + Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the
- * following disclaimer.
- *
- * + Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the
- * following disclaimer in the documentation and/or other
- * materials provided with the distribution.
- *
- * + Neither the name of Advanced Micro Devices, Inc. nor the
- * names of its contributors may be used to endorse or
- * promote products derived from this software without
- * specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
- * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
- * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
- * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * It is licensee's responsibility to comply with any export
- * regulations applicable in licensee's jurisdiction.
*/
- .file "strcmp.s"
+/*
+ * str[n]cmp - compare chars between two strings
+ */
#include "SYS.h"
-#include "cache.h"
+#include "proc64_id.h"
#define LABEL(s) .strcmp/**/s
#ifdef USE_AS_STRNCMP
+ /*
+ * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+ * if the new counter > the old one or is 0.
+ */
+#define UPDATE_STRNCMP_COUNTER \
+ /* calculate left number to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb LABEL(strcmp_exitz); \
+ test %r9, %r9; \
+ je LABEL(strcmp_exitz); \
+ mov %r9, %r11
+#else
+#define UPDATE_STRNCMP_COUNTER
+#endif
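For reference, the macro above behaves like the following C sketch ('left' stands for %r11 and 'off' for %rcx; the wraparound of the unsigned arithmetic is exactly what the jb catches):

#include <stdint.h>

/*
 * Sketch of UPDATE_STRNCMP_COUNTER: returns 0 when the strncmp count
 * is already satisfied.  'n' may wrap below zero because the
 * arithmetic is unsigned, and a wrapped value compares greater than
 * the old counter.
 */
static int
update_counter(uint64_t *left, uint64_t off)
{
	uint64_t n = off + *left - 16;	/* lea -16(%rcx, %r11), %r9 */

	if (n > *left || n == 0)	/* jb / je LABEL(strcmp_exitz) */
		return (0);
	*left = n;			/* mov %r9, %r11 */
	return (1);
}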
+
+ /*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCMP
ENTRY(strncmp)
+ test %rdx, %rdx
+ je LABEL(strcmp_exitz)
+ mov %rdx, %r11
#else
ENTRY(strcmp) /* (const char *, const char *) */
#endif
- xor %ecx, %ecx
+ mov %esi, %ecx
+ mov %edi, %eax
+ and $0x3f, %rcx /* rsi alignment in cache line */
+ and $0x3f, %rax /* rdi alignment in cache line */
+ cmp $0x30, %ecx
+ ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
+ cmp $0x30, %eax
+ ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
+ movlpd (%rdi), %xmm1
+ movlpd (%rsi), %xmm2
+ movhpd 8(%rdi), %xmm1
+ movhpd 8(%rsi), %xmm2
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results */
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* edx == 0 if first 16 bytes are the same */
+ jnz LABEL(less16bytes) /* If not, found mismatch or null char */
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* finish comparison */
+#endif
+ add $16, %rsi /* prepare to search next 16 bytes */
+ add $16, %rdi /* prepare to search next 16 bytes */
+
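The 16-byte step above (load, null check, equality check, packed subtract, move mask) corresponds to this SSE2 intrinsics sketch; it illustrates the technique and is not the shipped code:

#include <emmintrin.h>

/*
 * One 16-byte comparison step: returns 0 only when all 16 byte pairs
 * match and none of the bytes of s1 is a null terminator.
 */
static int
cmp16(const char *s1, const char *s2)
{
	__m128i a = _mm_loadu_si128((const __m128i *)s1);
	__m128i b = _mm_loadu_si128((const __m128i *)s2);
	__m128i nulls = _mm_cmpeq_epi8(a, _mm_setzero_si128());	/* pcmpeqb vs 0 */
	__m128i eq = _mm_cmpeq_epi8(a, b);				/* pcmpeqb */
	__m128i merged = _mm_sub_epi8(eq, nulls);			/* psubb */

	return (_mm_movemask_epi8(merged) - 0xffff);	/* pmovmskb; sub $0xffff */
}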
+ /*
+ * Determine rdi and rsi string offsets from 16-byte alignment.
+ * Use relative offset difference between the two to determine which case
+ * below to use.
+ */
+ .p2align 4
+LABEL(crosscache):
+ and $0xfffffffffffffff0, %rsi /* force %rsi to be 16 byte aligned */
+ and $0xfffffffffffffff0, %rdi /* force %rdi to be 16 byte aligned */
+ mov $0xffff, %edx /* for equivalent offset */
+ xor %r8d, %r8d
+ and $0xf, %ecx /* offset of rsi */
+ and $0xf, %eax /* offset of rdi */
+ cmp %eax, %ecx
+ je LABEL(ashr_0) /* both strings have the same alignment */
+ ja LABEL(bigger)
+ mov %edx, %r8d /* r8d is offset flag for exit tail */
+ xchg %ecx, %eax
+ xchg %rsi, %rdi
+LABEL(bigger):
+ mov %rcx, %r9
+ sub %rax, %r9
+ lea LABEL(unaligned_table)(%rip), %r10
+ movslq (%r10, %r9, 4), %r9
+ lea (%r10, %r9), %r10
+ jmp *%r10 /* jump to corresponding case */
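The dispatch above keys the handler off the difference of the two alignment offsets, swapping the operands first so the difference is positive, with %r8d recording the swap for the exit code. A small sketch of the selection (the struct and function names are hypothetical):

struct pick {
	unsigned idx;	/* 0..15, index into unaligned_table */
	int swapped;	/* nonzero if operands were exchanged (%r8d) */
};

static struct pick
pick_case(unsigned off_rsi, unsigned off_rdi)	/* each 0..15 */
{
	struct pick p = { 0, 0 };

	if (off_rsi < off_rdi) {	/* ensure a positive difference */
		unsigned t = off_rsi;
		off_rsi = off_rdi;
		off_rdi = t;
		p.swapped = 1;
	}
	p.idx = off_rsi - off_rdi;
	return (p);
}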
+/*
+ * ashr_0 handles the following cases:
+ * str1 offset = str2 offset
+ */
+ .p2align 4
+LABEL(ashr_0):
+ movdqa (%rsi), %xmm1
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results */
+ pmovmskb %xmm1, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ /*
+ * edx must equal r9d if the bytes from the offset to the end of the
+ * 16-byte block match and no null char was seen.
+ */
+ jne LABEL(less32bytes) /* mismatch or null char */
+ UPDATE_STRNCMP_COUNTER
+ mov $16, %rcx
+ mov $16, %r9
+ pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
+
+ /*
+ * Now both strings are aligned at 16-byte boundary. Loop over strings
+ * checking 32-bytes per iteration.
+ */
+ .p2align 4
+LABEL(loop_ashr_0):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- test %rdx, %rdx /* (const char *, const char *, size_t) */
- mov %r14, -8 (%rsp)
- mov %rdx, %r14
- mov %edx, %eax
- jz LABEL(exitz) /* early exit */
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ add $16, %rcx
+ jmp LABEL(loop_ashr_0)
-LABEL(aligntry):
- mov %rsi, %r8 /* align by "source" */
- and $8 - 1, %r8 /* between 0 and 8 characters compared */
- jz LABEL(alignafter)
+/*
+ * ashr_1 handles the following cases:
+ * abs(str1 offset - str2 offset) = 15
+ */
+ .p2align 4
+LABEL(ashr_1):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pslldq $15, %xmm2 /* shift first string to align with second */
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $1, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 1(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
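The %r10 setup is a biased in-page offset; a C sketch of the trick:

#include <stdint.h>

/*
 * Bias the in-page offset of the shifted source by -4096.  The loop
 * adds 16 before each aligned 16-byte load; once the bias goes
 * positive, the next load would run into the following 4K page, so
 * the code detours through the nibble path instead.
 */
static int64_t
page_bias(const char *base, unsigned shift)	/* shift is 1 for ashr_1 */
{
	return ((int64_t)(((uintptr_t)base + shift) & 0xfff) - 0x1000);
}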
+ .p2align 4
+LABEL(loop_ashr_1):
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+LABEL(gobble_ashr_1):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16-byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
-LABEL(align):
- sub $8, %r8
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16-byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_1)
- .p2align 4
+ /*
+ * The nibble path avoids a 16-byte load that would cross a page
+ * boundary and might touch unmapped memory.
+ */
+ .p2align 4
+LABEL(nibble_ashr_1):
+ psrldq $1, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x7fff, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $15, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_1)
-LABEL(alignloop):
- mov (%rsi, %rcx), %al
- mov (%rdi, %rcx), %dl
+/*
+ * ashr_2 handles the following cases:
+ * abs(str1 offset - str2 offset) = 14
+ */
+ .p2align 4
+LABEL(ashr_2):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $14, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $2, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 2(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_2):
+ add $16, %r10
+ jg LABEL(nibble_ashr_2)
+
+LABEL(gobble_ashr_2):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exitafter)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- cmp %dl, %al /* check if same character */
- jne LABEL(exitafter)
- test %al, %al /* check if character a NUL */
- jz LABEL(exitafter)
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_2) /* cross page boundary */
- inc %ecx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- inc %r8
- jnz LABEL(alignloop)
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_2)
+
+ .p2align 4
+LABEL(nibble_ashr_2):
+ psrldq $2, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x3fff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- test %r14, %r14
- jz LABEL(exitafter)
+ cmp $14, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_2)
- .p2align 4
+/*
+ * ashr_3 handles the following cases:
+ * abs(str1 offset - str2 offset) = 13
+ */
+ .p2align 4
+LABEL(ashr_3):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $13, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $3, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 3(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_3):
+ add $16, %r10
+ jg LABEL(nibble_ashr_3)
+
+LABEL(gobble_ashr_3):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $3, %xmm3
+ pslldq $13, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
-LABEL(alignafter):
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
- mov %r15, -32 (%rsp)
- mov %rbp, -24 (%rsp)
- mov %rbx, -16 (%rsp)
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
-LABEL(pagealigntry): /* page align by "destination" */
- lea (%rdi, %rcx), %ebp
- mov $AMD64PAGESIZE, %r15d
- and $AMD64PAGEMASK, %ebp
- sub %r15d, %ebp
+ add $16, %r10
+ jg LABEL(nibble_ashr_3) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $3, %xmm3
+ pslldq $13, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_3)
+
+ .p2align 4
+LABEL(nibble_ashr_3):
+ psrldq $3, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x1fff, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $13, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_3)
+
+/*
+ * ashr_4 handles the following cases:
+ * abs(str1 offset - str2 offset) = 12
+ */
+ .p2align 4
+LABEL(ashr_4):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $12, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $4, %r9d /* rdi bytes already examined. Used in exit code */
/*
- * When we go to 64gobble, %ebp was adjusted at the top of 64loop.
- * When we go to 64nibble(crossing page boundary), we'll compare
- * 128 byte since we'll fall through to 64gobble. Therefore, %ebp
- * needs to be re-adjusted(add 64) when we fall into 64nibble.
- * It can be done by adjusting %r15 since %r15 is only used to
- * rewind %ebp when crossing page boundary.
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
*/
- sub $64, %r15d
+ lea 4(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_4):
+ add $16, %r10
+ jg LABEL(nibble_ashr_4)
+
+LABEL(gobble_ashr_4):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $4, %xmm3
+ pslldq $12, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
-LABEL(64): /* 64-byte */
- mov $0xfefefefefefefeff, %rbx /* magic number */
+ add $16, %r10
+ jg LABEL(nibble_ashr_4) /* cross page boundary */
- .p2align 4
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
-LABEL(64loop):
- add $64, %ebp /* check if "destination" crosses a page unevenly */
- jle LABEL(64gobble)
+ psrldq $4, %xmm3
+ pslldq $12, %xmm2
+ por %xmm3, %xmm2
- sub %r15d, %ebp
- lea 64 (%rcx), %r8
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- .p2align 4
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_4)
+
+ .p2align 4
+LABEL(nibble_ashr_4):
+ psrldq $4, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x0fff, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $12, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_4)
-LABEL(64nibble):
- mov (%rsi, %rcx), %al
- mov (%rdi, %rcx), %dl
+/*
+ * ashr_5 handles the following cases:
+ * abs(str1 offset - str2 offset) = 11
+ */
+ .p2align 4
+LABEL(ashr_5):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $11, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $5, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 5(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_5):
+ add $16, %r10
+ jg LABEL(nibble_ashr_5)
+
+LABEL(gobble_ashr_5):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $5, %xmm3
+ pslldq $11, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- dec %r14
- jle LABEL(exit)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- cmp %dl, %al /* check if same character */
- jne LABEL(exit)
- test %al, %al /* check if character a NUL */
- jz LABEL(exit)
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_5) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $5, %xmm3
+ pslldq $11, %xmm2
+ por %xmm3, %xmm2
- inc %ecx
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- cmp %ecx, %r8d
- ja LABEL(64nibble)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
- .p2align 4
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_5)
+
+ .p2align 4
+LABEL(nibble_ashr_5):
+ psrldq $5, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x07ff, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $11, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_5)
-LABEL(64gobble):
- mov (%rsi, %rcx), %rax
- mov (%rdi, %rcx), %rdx
+/*
+ * ashr_6 handles the following cases:
+ * abs(str1 offset - str2 offset) = 10
+ */
+ .p2align 4
+LABEL(ashr_6):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $10, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $6, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 6(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_6):
+ add $16, %r10
+ jg LABEL(nibble_ashr_6)
+
+LABEL(gobble_ashr_6):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $6, %xmm3
+ pslldq $10, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_6) /* cross page boundary */
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $6, %xmm3
+ pslldq $10, %xmm2
+ por %xmm3, %xmm2
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- cmp %rdx, %rax
- jne LABEL(tail)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_6)
+
+ .p2align 4
+LABEL(nibble_ashr_6):
+ psrldq $6, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x03ff, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $10, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_6)
- mov 8 (%rsi, %rcx), %rax
- mov 8 (%rdi, %rcx), %rdx
- add $8, %ecx
+/*
+ * ashr_7 handles the following cases:
+ * abs(str1 offset - str2 offset) = 9
+ */
+ .p2align 4
+LABEL(ashr_7):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $9, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $7, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 7(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_7):
+ add $16, %r10
+ jg LABEL(nibble_ashr_7)
+
+LABEL(gobble_ashr_7):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $7, %xmm3
+ pslldq $9, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+ add $16, %r10
+ jg LABEL(nibble_ashr_7) /* cross page boundary */
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ psrldq $7, %xmm3
+ pslldq $9, %xmm2
+ por %xmm3, %xmm2
- cmp %rdx, %rax
- jne LABEL(tail)
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- mov 8 (%rsi, %rcx), %rax
- mov 8 (%rdi, %rcx), %rdx
- add $8, %ecx
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_7)
+
+ .p2align 4
+LABEL(nibble_ashr_7):
+ psrldq $7, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x01ff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ cmp $9, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_7)
+
+/*
+ * ashr_8 handles the following cases:
+ * abs(str1 offset - str2 offset) = 8
+ */
+ .p2align 4
+LABEL(ashr_8):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $8, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $8, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 8(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_8):
+ add $16, %r10
+ jg LABEL(nibble_ashr_8)
+
+LABEL(gobble_ashr_8):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $8, %xmm3
+ pslldq $8, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ add $16, %r10
+ jg LABEL(nibble_ashr_8) /* cross page boundary */
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- cmp %rdx, %rax
- jne LABEL(tail)
+ psrldq $8, %xmm3
+ pslldq $8, %xmm2
+ por %xmm3, %xmm2
- mov 8 (%rsi, %rcx), %rax
- mov 8 (%rdi, %rcx), %rdx
- add $8, %ecx
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_8)
+
+ .p2align 4
+LABEL(nibble_ashr_8):
+ psrldq $8, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x00ff, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $8, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_8)
+
+/*
+ * ashr_9 handles the following cases:
+ * abs(str1 offset - str2 offset) = 7
+ */
+ .p2align 4
+LABEL(ashr_9):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $7, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $9, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 9(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_9):
+ add $16, %r10
+ jg LABEL(nibble_ashr_9)
+
+LABEL(gobble_ashr_9):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $9, %xmm3
+ pslldq $7, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_9) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $9, %xmm3
+ pslldq $7, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* store for next cycle */
+ jmp LABEL(loop_ashr_9)
+
+ .p2align 4
+LABEL(nibble_ashr_9):
+ psrldq $9, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x007f, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $7, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_9)
+
+/*
+ * ashr_10 handles the following cases:
+ * abs(str1 offset - str2 offset) = 6
+ */
+ .p2align 4
+LABEL(ashr_10):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $6, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $10, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 10(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_10):
+ add $16, %r10
+ jg LABEL(nibble_ashr_10)
+
+LABEL(gobble_ashr_10):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $10, %xmm3
+ pslldq $6, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ add $16, %r10
+ jg LABEL(nibble_ashr_10) /* cross page boundary */
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- cmp %rdx, %rax
- jne LABEL(tail)
+ psrldq $10, %xmm3
+ pslldq $6, %xmm2
+ por %xmm3, %xmm2
- mov 8 (%rsi, %rcx), %rax
- mov 8 (%rdi, %rcx), %rdx
- add $8, %ecx
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_10)
+
+ .p2align 4
+LABEL(nibble_ashr_10):
+ psrldq $10, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x003f, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $6, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_10)
+
+/*
+ * ashr_11 handles the following cases:
+ * abs(str1 offset - str2 offset) = 5
+ */
+ .p2align 4
+LABEL(ashr_11):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $5, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $11, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 11(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_11):
+ add $16, %r10
+ jg LABEL(nibble_ashr_11)
+
+LABEL(gobble_ashr_11):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $11, %xmm3
+ pslldq $5, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_11) /* cross page boundary */
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ psrldq $11, %xmm3
+ pslldq $5, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
- cmp %rdx, %rax
- jne LABEL(tail)
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_11)
+
+ .p2align 4
+LABEL(nibble_ashr_11):
+ psrldq $11, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x001f, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $5, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_11)
- mov 8 (%rsi, %rcx), %rax
- mov 8 (%rdi, %rcx), %rdx
- add $8, %ecx
+/*
+ * ashr_12 handles the following cases:
+ * abs(str1 offset - str2 offset) = 4
+ */
+ .p2align 4
+LABEL(ashr_12):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $4, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $12, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 12(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_12):
+ add $16, %r10
+ jg LABEL(nibble_ashr_12)
+
+LABEL(gobble_ashr_12):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $12, %xmm3
+ pslldq $4, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+ add $16, %r10
+ jg LABEL(nibble_ashr_12) /* cross page boundary */
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ psrldq $12, %xmm3
+ pslldq $4, %xmm2
+ por %xmm3, %xmm2
- cmp %rdx, %rax
- jne LABEL(tail)
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- mov 8 (%rsi, %rcx), %rax
- mov 8 (%rdi, %rcx), %rdx
- add $8, %ecx
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_12)
+
+ .p2align 4
+LABEL(nibble_ashr_12):
+ psrldq $12, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x000f, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $4, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_12)
+
+/*
+ * ashr_13 handles the following cases:
+ * abs(str1 offset - str2 offset) = 3
+ */
+ .p2align 4
+LABEL(ashr_13):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $3, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $13, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 13(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_13):
+ add $16, %r10
+ jg LABEL(nibble_ashr_13)
+
+LABEL(gobble_ashr_13):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $13, %xmm3
+ pslldq $3, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+ add $16, %r10
+ jg LABEL(nibble_ashr_13) /* cross page boundary */
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ psrldq $13, %xmm3
+ pslldq $3, %xmm2
+ por %xmm3, %xmm2
- cmp %rdx, %rax
- jne LABEL(tail)
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- mov 8 (%rsi, %rcx), %rax
- mov 8 (%rdi, %rcx), %rdx
- add $8, %ecx
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_13)
+
+ .p2align 4
+LABEL(nibble_ashr_13):
+ psrldq $13, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x0007, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- sub $8, %r14
- jle LABEL(tail)
+ cmp $3, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_13)
+
+/*
+ * ashr_14 handles the following cases:
+ * abs(str1 offset - str2 offset) = 2
+ */
+ .p2align 4
+LABEL(ashr_14):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $2, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $14, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_14):
+ add $16, %r10
+ jg LABEL(nibble_ashr_14)
+
+LABEL(gobble_ashr_14):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- mov %rbx, %r8
- add %rax, %r8
- sbb %r10, %r10
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
- mov %rbx, %r9
- add %rdx, %r9
- sbb %r11, %r11
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
- xor %rax, %r8
- or %rbx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ add $16, %r10
+ jg LABEL(nibble_ashr_14) /* cross page boundary */
- xor %rdx, %r9
- or %rbx, %r9
- sub %r11, %r9
- jnz LABEL(tail)
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
- cmp %rdx, %rax
- jne LABEL(tail)
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2
- add $8, %ecx
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
- jmp LABEL(64loop)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
-LABEL(64after):
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_14)
+
+ .p2align 4
+LABEL(nibble_ashr_14):
+ psrldq $14, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x0003, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $2, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_14)
-LABEL(tailtry):
+/*
+ * ashr_15 handles the following cases:
+ * abs(str1 offset - str2 offset) = 1
+ */
+ .p2align 4
+LABEL(ashr_15):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* rdi bytes already examined. Used in exit code */
+ /*
+ * Set up %r10 so we can detect when a load crosses a page boundary.
+ * When %r10 goes positive we are about to cross a page boundary and
+ * need to take the nibble path.
+ */
+ lea 15(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ movdqa %xmm3, %xmm4
+
+ .p2align 4
+LABEL(loop_ashr_15):
+ add $16, %r10
+ jg LABEL(nibble_ashr_15)
+
+LABEL(gobble_ashr_15):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $15, %xmm3
+ pslldq $1, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
-LABEL(tail): /* byte tail */
#ifdef USE_AS_STRNCMP
- add $7, %r14
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- cmp %dl, %al /* check if same character */
- jne LABEL(exit)
- test %al, %al /* check if character a NUL */
- jz LABEL(exit)
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_15) /* cross page boundary */
- shr $8, %rax
- shr $8, %rdx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $15, %xmm3
+ pslldq $1, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exit)
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- cmp %dl, %al
- jne LABEL(exit)
- test %al, %al
- jz LABEL(exit)
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_15)
+
+ .p2align 4
+LABEL(nibble_ashr_15):
+ psrldq $15, %xmm4
+ movdqa (%rsi, %rcx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0x0001, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $1, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_15)
+
+ .p2align 4
+LABEL(exit):
+ lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
+LABEL(less32bytes):
+ lea (%rdi, %rax), %rdi /* locate the exact address for first operand (rdi) */
+ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand (rsi) */
+ test %r8d, %r8d
+ jz LABEL(ret)
+ xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+
+ .p2align 4
+LABEL(ret):
+LABEL(less16bytes):
+ /*
+ * Check to see if BSF is fast on this processor. If not, use a different
+ * exit tail.
+ */
+ testl $USE_BSF,.memops_method(%rip)
+ jz LABEL(AMD_exit)
+ bsf %rdx, %rdx /* find and store bit index in %rdx */
+
+#ifdef USE_AS_STRNCMP
+ sub %rdx, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ xor %ecx, %ecx /* clear %ecx */
+ xor %eax, %eax /* clear %eax */
+
+ movb (%rsi, %rdx), %cl
+ movb (%rdi, %rdx), %al
- shr $8, %rax
- shr $8, %rdx
+ sub %ecx, %eax
+ ret
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exit)
+LABEL(strcmp_exitz):
+ xor %eax, %eax
+ ret
#endif
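The bsf tail is a count-trailing-zeros on the mismatch mask; a C analogue using the GCC builtin (a sketch, and it assumes mask != 0, which the jnz paths above guarantee):

/*
 * 'mask' is the pmovmskb result after the matching bits have been
 * cancelled, so its lowest set bit indexes the first byte pair that
 * differs or contains a null.
 */
static int
exit_tail(const unsigned char *s1, const unsigned char *s2, unsigned mask)
{
	int i = __builtin_ctz(mask);	/* bsf %rdx, %rdx */

	return (s1[i] - s2[i]);		/* movb; movb; sub %ecx, %eax */
}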
- cmp %dl, %al
- jne LABEL(exit)
- test %al, %al
- jz LABEL(exit)
+ /*
+ * This exit tail does not use the bsf instruction.
+ */
+ .p2align 4
+LABEL(AMD_exit):
+ test %dl, %dl
+ jz LABEL(next_8_bytes)
+
+ test $0x01, %dl
+ jnz LABEL(Byte0)
+
+ test $0x02, %dl
+ jnz LABEL(Byte1)
+
+ test $0x04, %dl
+ jnz LABEL(Byte2)
+
+ test $0x08, %dl
+ jnz LABEL(Byte3)
+
+ test $0x10, %dl
+ jnz LABEL(Byte4)
+
+ test $0x20, %dl
+ jnz LABEL(Byte5)
+
+ test $0x40, %dl
+ jnz LABEL(Byte6)
+
+#ifdef USE_AS_STRNCMP
+ sub $7, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 7(%rsi), %ecx
+ movzx 7(%rdi), %eax
- shr $8, %rax
- shr $8, %rdx
+ sub %ecx, %eax
+ ret
+ .p2align 4
+LABEL(Byte0):
+ /*
+ * never need to handle byte 0 for strncmp
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exit)
+ sub $0, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ */
+ movzx (%rsi), %ecx
+ movzx (%rdi), %eax
- cmp %dl, %al
- jne LABEL(exit)
- test %al, %al
- jz LABEL(exit)
+ sub %ecx, %eax
+ ret
- shr $8, %rax
- shr $8, %rdx
+ .p2align 4
+LABEL(Byte1):
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exit)
+ sub $1, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ movzx 1(%rsi), %ecx
+ movzx 1(%rdi), %eax
- cmp %dl, %al
- jne LABEL(exit)
- test %al, %al
- jz LABEL(exit)
+ sub %ecx, %eax
+ ret
- shr $8, %eax
- shr $8, %edx
+ .p2align 4
+LABEL(Byte2):
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exit)
+ sub $2, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ movzx 2(%rsi), %ecx
+ movzx 2(%rdi), %eax
- cmp %dl, %al
- jne LABEL(exit)
- test %al, %al
- jz LABEL(exit)
+ sub %ecx, %eax
+ ret
- shr $8, %eax
- shr $8, %edx
+ .p2align 4
+LABEL(Byte3):
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exit)
+ sub $3, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ movzx 3(%rsi), %ecx
+ movzx 3(%rdi), %eax
- cmp %dl, %al
- jne LABEL(exit)
- test %al, %al
- jz LABEL(exit)
+ sub %ecx, %eax
+ ret
- shr $8, %eax
- shr $8, %edx
+ .p2align 4
+LABEL(Byte4):
#ifdef USE_AS_STRNCMP
- dec %r14
- jl LABEL(exit)
+ sub $4, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ movzx 4(%rsi), %ecx
+ movzx 4(%rdi), %eax
- cmp %dl, %al
- jne LABEL(exit)
+ sub %ecx, %eax
+ ret
- .p2align 4,, 15
+ .p2align 4
+LABEL(Byte5):
-LABEL(tailafter):
+#ifdef USE_AS_STRNCMP
+ sub $5, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 5(%rsi), %ecx
+ movzx 5(%rdi), %eax
-LABEL(exit):
- mov -32 (%rsp), %r15
- mov -24 (%rsp), %rbp
- mov -16 (%rsp), %rbx
+ sub %ecx, %eax
+ ret
- .p2align 4,, 3
+ .p2align 4
+LABEL(Byte6):
-LABEL(exitafter):
#ifdef USE_AS_STRNCMP
- test %r14, %r14
- cmovl %edx, %eax
+ sub $6, %r11
+ jbe LABEL(strcmp_exitz)
#endif
+ movzx 6(%rsi), %ecx
+ movzx 6(%rdi), %eax
- movzx %al, %eax
- movzx %dl, %edx
- sub %eax, %edx
- xchg %edx, %eax
+ sub %ecx, %eax
+ ret
+ .p2align 4
+LABEL(next_8_bytes):
+ add $8, %rdi
+ add $8, %rsi
#ifdef USE_AS_STRNCMP
-LABEL(exitz):
- mov -8 (%rsp), %r14
+ sub $8, %r11
+ jbe LABEL(strcmp_exitz)
#endif
- ret
+ test $0x01, %dh
+ jnz LABEL(Byte0)
+
+ test $0x02, %dh
+ jnz LABEL(Byte1)
+ test $0x04, %dh
+ jnz LABEL(Byte2)
+
+ test $0x08, %dh
+ jnz LABEL(Byte3)
+
+ test $0x10, %dh
+ jnz LABEL(Byte4)
+
+ test $0x20, %dh
+ jnz LABEL(Byte5)
+
+ test $0x40, %dh
+ jnz LABEL(Byte6)
+
+#ifdef USE_AS_STRNCMP
+ sub $7, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 7(%rsi), %ecx
+ movzx 7(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .pushsection .rodata
+ .p2align 4
+LABEL(unaligned_table):
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_1) - LABEL(unaligned_table)
+ .popsection
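Each entry of unaligned_table is a 32-bit self-relative offset (handler minus table base), so the table lives in .rodata and needs no load-time relocation; the dispatcher sign-extends an entry with movslq and adds it back to the base. The GNU C labels-as-values extension expresses the same idea (gcc/clang only; a two-case sketch):

/* Dispatch through a table of self-relative offsets. */
static int
dispatch(unsigned idx)
{
	static const int table[2] = {
		&&case0 - &&case0,	/* like .int LABEL(ashr_0) - LABEL(unaligned_table) */
		&&case1 - &&case0,
	};

	goto *(&&case0 + table[idx]);
case0:
	return (0);
case1:
	return (1);
}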
#ifdef USE_AS_STRNCMP
SET_SIZE(strncmp)
#else
diff --git a/usr/src/lib/libc/amd64/gen/strcpy.s b/usr/src/lib/libc/amd64/gen/strcpy.s
index f4de3ab1f1..080fe913ae 100644
--- a/usr/src/lib/libc/amd64/gen/strcpy.s
+++ b/usr/src/lib/libc/amd64/gen/strcpy.s
@@ -1,862 +1,2582 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
*/
/*
- * Copyright (c) 2002 Advanced Micro Devices, Inc.
- *
+ * Copyright (c) 2009, Intel Corporation
* All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the
- * following conditions are met:
- *
- * + Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the
- * following disclaimer.
- *
- * + Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the
- * following disclaimer in the documentation and/or other
- * materials provided with the distribution.
- *
- * + Neither the name of Advanced Micro Devices, Inc. nor the
- * names of its contributors may be used to endorse or
- * promote products derived from this software without
- * specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
- * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
- * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
- * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * It is licensee's responsibility to comply with any export
- * regulations applicable in licensee's jurisdiction.
*/
- .file "strcpy.s"
-
+/*
+ * str[n]cpy - copy [n] chars from second operand into first operand
+ */
#include "SYS.h"
-#include "cache.h"
+#include "proc64_id.h"
#define LABEL(s) .strcpy/**/s
#ifdef USE_AS_STRNCPY
ENTRY(strncpy)
+ test %edx, %edx
+ jz LABEL(strncpy_exitz)
+ mov %rdx, %r8
#else
- ENTRY(strcpy) /* (char *, const char *) */
+ ENTRY(strcpy) /* (char *, const char *) */
+ xor %rdx, %rdx
+#endif
+ mov %esi, %ecx
+ and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
+ and $0xf, %rcx
+ mov %rdi, %rax /* save destination address for return value */
+
+
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for null */
+ pmovmskb %xmm0, %edx
+ shr %cl, %edx /* adjust for offset from 16-byte boundary */
+ test %edx, %edx /* edx will be 0 if chars are non-null */
+ jnz LABEL(less16bytes) /* null char found in first 16 bytes examined */
+#ifdef USE_AS_STRNCPY
+ /*
+ * Check if the count is satisfied in first 16 bytes examined.
+ */
+ lea -16(%r8, %rcx), %r11
+ cmp $0, %r11
+ jle LABEL(less16bytes)
+#endif
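
The probe above reads the aligned 16-byte block that contains the start of src (safe, because an aligned 16-byte load never crosses a page boundary) and then shifts the null mask right to discard the bytes that sit before src. A minimal C intrinsics sketch of the same trick, with hypothetical names, illustrative only:

    #include <emmintrin.h>	/* SSE2 intrinsics */
    #include <stdint.h>

    /*
     * Sketch of the opening null probe: load the aligned block around
     * src, compare against zero, and shift off the leading bytes.
     */
    static unsigned
    first_null_mask(const char *src)
    {
            uintptr_t off = (uintptr_t)src & 0xf;
            const __m128i *blk = (const __m128i *)((uintptr_t)src - off);
            __m128i cmp = _mm_cmpeq_epi8(_mm_load_si128(blk),
                _mm_setzero_si128());

            /* bit i set: src[i] == '\0' within the probed block */
            return ((unsigned)_mm_movemask_epi8(cmp) >> off);
    }
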
+ mov %rcx, %r9 /* rsi alignment offset */
+ or %edi, %ecx
+ and $0xf, %ecx
+ lea -16(%r9), %r10
+ jz LABEL(ashr_0) /* src and dest are both 16 byte aligned */
+
+ neg %r10 /* max src bytes remaining in current dqword */
+
+ pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation */
+ pcmpeqb 16(%rsi), %xmm0 /* check next 16 bytes in src for a null */
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(less32bytes) /* null char found in first 32 bytes examined */
+
+#ifdef USE_AS_STRNCPY
+ /*
+ * If strncpy count <= 16 go to exit case
+ */
+ sub $16, %r8
+ jbe LABEL(less32bytes_strncpy_truncation)
+#endif
+ /*
+ * At least 16 bytes to copy to destination string. Move them now.
+ * Don't worry about alignment.
+ */
+ mov (%rsi, %r9), %rdx
+ mov %rdx, (%rdi)
+ mov 8(%rsi, %r9), %rdx
+ mov %rdx, 8(%rdi)
+
+ /*
+ * The destination rdi may not yet be 16-byte aligned. Align it,
+ * recalculate rsi, and jump to the corresponding src/dest relative
+ * offset case.
+ * rcx is the offset of rsi
+ * rdx is the offset of rdi
+ */
+ and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
+ mov %rax, %rdx /* rax contains original rdi */
+ xor %rdi, %rdx /* same effect as "and $0xf, %rdx" */
+#ifdef USE_AS_STRNCPY
+ /*
+ * Will now do 16-byte aligned stores. Stores may overlap some bytes
+ * (i.e., store twice) if the destination was unaligned. Compensate
+ * here: rdx holds the destination misalignment, and that many bytes
+ * will be written again by the first aligned store.
+ */
+ add %rdx, %r8 /* compensate for overlap */
+#endif
+
+ add $16, %rdi /* next 16 bytes for dest */
+
+ /*
+ * align src to 16-byte boundary. Could be up or down depending on
+ * whether src offset - dest offset > 0 (up) or
+ * src offset - dest offset < 0 (down).
+ */
+ sub %rdx, %r9 /* src offset - dest offset */
+
+ lea 16(%r9, %rsi), %rsi
+ mov %esi, %ecx /* for new src offset */
+ and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
+
+ and $0xf, %ecx /* new src offset is 0 if rsi/rdi have same alignment */
+ jz LABEL(ashr_0)
+
+#ifdef USE_AS_STRNCPY
+ xor %edx, %edx /* In case unaligned_exit is taken */
+#endif
+ /*
+ * Jump to case corresponding to source/dest string relative offsets
+ * Index = (16 + (src offset - dest offset)) % 16
+ */
+ lea -16(%rcx), %r10
+ mov %rcx, %r9
+ neg %r10 /* max src bytes remaining in current dqword */
+ lea LABEL(unaligned_table)(%rip), %r11
+ movslq (%r11, %rcx, 4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
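
The jump above dispatches on the relative misalignment of the two strings. Read as plain C, the index arithmetic from the comment is simply (hypothetical helper name, illustrative only):

    #include <stdint.h>

    /* Index = (16 + (src offset - dest offset)) % 16, always 0..15. */
    static unsigned
    ashr_index(uintptr_t src, uintptr_t dest)
    {
            return ((unsigned)((16 + (src & 0xf) - (dest & 0xf)) & 0xf));
    }

An index of 0 never reaches the table; it is filtered out earlier by the jz to ashr_0.
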
+
+/*
+ * ashr_0 handles the following cases:
+ * src alignment offset = dest alignment offset
+ */
+ .p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
#endif
+ movdqa (%rsi), %xmm1 /* fetch 16 bytes from src string */
+ movdqa %xmm1, (%rdi) /* store 16 bytes into dest string */
+ add $16, %rsi
+ add $16, %rdi
+ pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for a null */
+ pmovmskb %xmm0, %edx
+
+ test %edx, %edx /* edx will be 0 if chars are non-null */
+ jnz LABEL(aligned_16bytes) /* exit tail */
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
#ifdef USE_AS_STRNCPY
- test %rdx, %rdx /* (char *, const char *, size_t) */
- mov %rdx, %r11
- jz LABEL(exitn) /* early exit */
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
- xor %edx, %edx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
-LABEL(aligntry):
- mov %rsi, %r8 /* align by source */
- and $7, %r8
- jz LABEL(alignafter)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
-LABEL(align): /* 8-byte align */
- sub $8, %r8
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jz LABEL(ashr_0_loop)
+ jmp LABEL(aligned_exit)
+
+
+/*
+ * ashr_15 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 15
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_15):
+ xor %ecx, %ecx /* clear index */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_15_use_sse2)
.p2align 4
+LABEL(ashr_15_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $15, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0f
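
A note on these .byte sequences: each pair hand-encodes the SSSE3 palignr instruction shown in the preceding '#palignr' comment (66 0F 3A 0F is the palignr opcode, the final byte is the shift immediate), presumably so the file still assembles with older toolchains that lack the SSSE3 mnemonics. The same pattern repeats in every ashr_N ssse3 loop below.
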
-LABEL(alignloop):
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- dec %r11
- jl LABEL(exitn)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov (%rsi, %rdx), %al /* check if same character */
- test %al, %al /* check if character a NUL */
- mov %al, (%rdi, %rdx)
- jz LABEL(exit)
+ #palignr $15, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0f
- inc %edx
- inc %r8
- jnz LABEL(alignloop)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- test %r11, %r11 /* must check remaining size */
- jz LABEL(exitn) /* If we've already done, exit */
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ jmp LABEL(ashr_15_use_ssse3)
.p2align 4
+LABEL(ashr_15_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
-LABEL(alignafter):
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
-LABEL(8try):
- mov $0xfefefefefefefeff, %rcx
+ psrldq $15, %xmm2
+ pslldq $1, %xmm3
+ por %xmm2, %xmm3
-LABEL(8): /* 8-byte */
- mov (%rsi, %rdx), %rax
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
-LABEL(8loop):
+ psrldq $15, %xmm2
+ pslldq $1, %xmm3
+ por %xmm2, %xmm3
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ jmp LABEL(ashr_15_use_sse2)
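
Both loops above compute the same 16 output bytes; they differ only in whether one palignr or the psrldq/pslldq/por triple stitches the two source blocks together. A minimal C intrinsics sketch of the two variants for the ashr_15 shift, illustrative only:

    #include <emmintrin.h>	/* SSE2: psrldq/pslldq/por */
    #ifdef __SSSE3__
    #include <tmmintrin.h>	/* SSSE3: palignr */
    #endif

    /*
     * Combine two adjacent 16-byte blocks so the result starts 15
     * bytes into "lo": one palignr on SSSE3, three SSE2 ops otherwise.
     */
    static __m128i
    align_shift_15(__m128i lo, __m128i hi)
    {
    #ifdef __SSSE3__
            return (_mm_alignr_epi8(hi, lo, 15));
    #else
            return (_mm_or_si128(_mm_srli_si128(lo, 15),
                _mm_slli_si128(hi, 1)));
    #endif
    }

The other ashr_N cases are the same sketch with shift N and its 16 - N complement.
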
+
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+/*
+ * ashr_14 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 14
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_14):
+ xor %ecx, %ecx /* clear index */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_14_use_sse2)
+
+ .p2align 4
+LABEL(ashr_14_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $14, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0e
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $14, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0e
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_14_use_ssse3)
+
+ .p2align 4
+LABEL(ashr_14_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $14, %xmm2
+ pslldq $2, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $14, %xmm2
+ pslldq $2, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_14_use_sse2)
+
+/*
+ * ashr_13 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 13
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_13):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_13_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_13_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $13, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0d
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $13, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0d
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_13_use_ssse3)
+
+ .p2align 4
+LABEL(ashr_13_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $13, %xmm2
+ pslldq $3, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $13, %xmm2
+ pslldq $3, %xmm3
+ por %xmm2, %xmm3
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_13_use_sse2)
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+/*
+ * ashr_12 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 12
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_12):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_12_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_12_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $12, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0c
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ #palignr $12, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0c
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_12_use_ssse3)
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ .p2align 4
+LABEL(ashr_12_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
+
+ psrldq $12, %xmm2
+ pslldq $4, %xmm3
+ por %xmm2, %xmm3
-LABEL(8after):
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
-LABEL(64try):
- mov _sref_(.amd64cache1half), %r9
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
-LABEL(64): /* 64-byte */
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- .p2align 4
+ psrldq $12, %xmm2
+ pslldq $4, %xmm3
+ por %xmm2, %xmm3
-LABEL(64loop):
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ jmp LABEL(ashr_12_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+/*
+ * ashr_11 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 11
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_11):
+ xor %ecx, %ecx /* clear index */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_11_use_sse2)
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ .p2align 4
+LABEL(ashr_11_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $11, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0b
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+ #palignr $11, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0b
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_11_use_ssse3)
+
+ .p2align 4
+LABEL(ashr_11_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $11, %xmm2
+ pslldq $5, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $11, %xmm2
+ pslldq $5, %xmm3
+ por %xmm2, %xmm3
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_11_use_sse2)
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+/*
+ * ashr_10 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 10
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_10):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_10_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_10_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $10, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0a
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $10, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x0a
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_10_use_ssse3)
+ .p2align 4
+LABEL(ashr_10_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $10, %xmm2
+ pslldq $6, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
+
+ psrldq $10, %xmm2
+ pslldq $6, %xmm3
+ por %xmm2, %xmm3
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_10_use_sse2)
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+/*
+ * ashr_9 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 9
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_9):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_9_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_9_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $9, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x09
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ #palignr $9, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x09
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_9_use_ssse3)
- cmp %r9, %rdx
+ .p2align 4
+LABEL(ashr_9_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- lea 8 (%rdx), %rdx
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- jbe LABEL(64loop)
+ psrldq $9, %xmm2
+ pslldq $7, %xmm3
+ por %xmm2, %xmm3
-LABEL(64after):
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
-LABEL(pretry):
- mov _sref_(.amd64cache2half), %r9
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
-LABEL(pre): /* 64-byte prefetch */
+ psrldq $9, %xmm2
+ pslldq $7, %xmm3
+ por %xmm2, %xmm3
- .p2align 4
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_9_use_sse2)
+
+
+/*
+ * ashr_8 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 8
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_8):
+ xor %ecx, %ecx /* clear index */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_8_use_sse2)
-LABEL(preloop):
+ .p2align 4
+LABEL(ashr_8_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ #palignr $8, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x08
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $8, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x08
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_8_use_ssse3)
+ .p2align 4
+LABEL(ashr_8_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $8, %xmm2
+ pslldq $8, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $8, %xmm2
+ pslldq $8, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_8_use_sse2)
+
+/*
+ * ashr_7 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 7
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_7):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_7_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_7_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $7, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x07
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $7, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x07
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_7_use_ssse3)
+ .p2align 4
+LABEL(ashr_7_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $7, %xmm2
+ pslldq $9, %xmm3
+ por %xmm2, %xmm3
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ psrldq $7, %xmm2
+ pslldq $9, %xmm3
+ por %xmm2, %xmm3
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_7_use_sse2)
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+/*
+ * ashr_6 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 6
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_6):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_6_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_6_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ #palignr $6, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x06
- mov %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %edx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ #palignr $6, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x06
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(tail)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_6_use_ssse3)
- cmp %r9, %rdx
+ .p2align 4
+LABEL(ashr_6_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- mov %rax, (%rdi, %rdx)
- prefetchnta 512 + 8 (%rdi, %rdx) /* 3DNow: use prefetchw */
- mov 8 (%rsi, %rdx), %rax
- prefetchnta 512 + 8 (%rsi, %rdx) /* 3DNow: use prefetch */
- lea 8 (%rdx), %rdx
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- jb LABEL(preloop)
+ psrldq $6, %xmm2
+ pslldq $10, %xmm3
+ por %xmm2, %xmm3
- .p2align 4
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
-LABEL(preafter):
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
-LABEL(NTtry):
- mfence
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
-LABEL(NT): /* 64-byte NT */
+ psrldq $6, %xmm2
+ pslldq $10, %xmm3
+ por %xmm2, %xmm3
- .p2align 4
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_6_use_sse2)
+
+
+/*
+ * ashr_5 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 5
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_5):
+ xor %ecx, %ecx /* clear index */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_5_use_sse2)
-LABEL(NTloop):
+ .p2align 4
+LABEL(ashr_5_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ #palignr $5, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x05
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ #palignr $5, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x05
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %rdx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_5_use_ssse3)
+ .p2align 4
+LABEL(ashr_5_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ psrldq $5, %xmm2
+ pslldq $11, %xmm3
+ por %xmm2, %xmm3
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %rdx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ psrldq $5, %xmm2
+ pslldq $11, %xmm3
+ por %xmm2, %xmm3
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %rdx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_5_use_sse2)
+
+/*
+ * ashr_4 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 4
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_4):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_4_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_4_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ #palignr $4, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x04
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %rdx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ #palignr $4, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x04
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_4_use_ssse3)
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %rdx
+ .p2align 4
+LABEL(ashr_4_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
+
+ psrldq $4, %xmm2
+ pslldq $12, %xmm3
+ por %xmm2, %xmm3
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
+
+ psrldq $4, %xmm2
+ pslldq $12, %xmm3
+ por %xmm2, %xmm3
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_4_use_sse2)
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %rdx
+/*
+ * ashr_3 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 3
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_3):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_3_use_sse2)
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ .p2align 4
+LABEL(ashr_3_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ #palignr $3, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x03
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- add $8, %rdx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- sub $8, %r11
- jle LABEL(tail)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- mov %rcx, %r8
- add %rax, %r8
- sbb %r10, %r10
+ #palignr $3, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x03
- xor %rax, %r8
- or %rcx, %r8
- sub %r10, %r8
- jnz LABEL(NTtail)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_3_use_ssse3)
- movnti %rax, (%rdi, %rdx)
- mov 8 (%rsi, %rdx), %rax
- prefetchnta 768 + 8 (%rsi, %rdx)
- add $8, %rdx
+ .p2align 4
+LABEL(ashr_3_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- jmp LABEL(NTloop)
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- .p2align 4
+ psrldq $3, %xmm2
+ pslldq $13, %xmm3
+ por %xmm2, %xmm3
-LABEL(NTtail):
- mfence
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
- .p2align 4
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
-LABEL(NTafter):
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
-LABEL(tailtry):
+ psrldq $3, %xmm2
+ pslldq $13, %xmm3
+ por %xmm2, %xmm3
-LABEL(tail): /* 1-byte tail */
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- add $8, %r11
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ jmp LABEL(ashr_3_use_sse2)
- .p2align 4
-LABEL(tailloop):
+/*
+ * ashr_2 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 2
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_2):
+ xor %ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
- dec %r11
- jl LABEL(exitn)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_2_use_sse2)
- test %al, %al
- mov %al, (%rdi, %rdx)
- jz LABEL(exit)
+ .p2align 4
+LABEL(ashr_2_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- inc %rdx
+ #palignr $2, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x02
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $2, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x02
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- dec %r11
- jl LABEL(exitn)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_2_use_ssse3)
- mov %ah, %al
+ .p2align 4
+LABEL(ashr_2_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- test %ah, %ah
- mov %ah, (%rdi, %rdx)
- jz LABEL(exit)
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- inc %rdx
+ psrldq $2, %xmm2
+ pslldq $14, %xmm3
+ por %xmm2, %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
- dec %r11
- jl LABEL(exitn)
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
- shr $16, %rax
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
+
+ psrldq $2, %xmm2
+ pslldq $14, %xmm3
+ por %xmm2, %xmm3
- test %al, %al
- mov %al, (%rdi, %rdx)
- jz LABEL(exit)
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_2_use_sse2)
- inc %rdx
+/*
+ * ashr_1 handles the following cases:
+ * (16 + (src offset - dest offset)) % 16 = 1
+ *
+ * Based on the operation above, there is no null byte in this cache
+ * bank to the left of (%r9 + rsi).
+ */
+ .p2align 4
+LABEL(ashr_1):
+ xor %ecx, %ecx /* clear index */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
+ jz LABEL(ashr_1_use_sse2)
+
+ .p2align 4
+LABEL(ashr_1_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ #palignr $1, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x01
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+ #palignr $1, (%rsi, %rcx), %xmm3
+ .byte 0x66, 0x0F, 0x3A, 0x0F
+ .byte 0x1c, 0x0e, 0x01
+
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- dec %r11
- jl LABEL(exitn)
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_1_use_ssse3)
- mov %ah, %al
+ .p2align 4
+LABEL(ashr_1_use_sse2):
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
#endif
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
- test %ah, %ah
- mov %ah, (%rdi, %rdx)
- jz LABEL(exit)
+ psrldq $1, %xmm2
+ pslldq $15, %xmm3
+ por %xmm2, %xmm3
- shr $16, %rax
- inc %rdx
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
- jmp LABEL(tailloop)
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ pcmpeqb 16(%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
- .p2align 4
+ movdqa 16(%rsi, %rcx), %xmm3
+ movdqa (%rsi, %rcx), %xmm2
-LABEL(tailafter):
+ psrldq $1, %xmm2
+ pslldq $15, %xmm3
+ por %xmm2, %xmm3
-LABEL(exit):
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
#ifdef USE_AS_STRNCPY
- test %r11, %r11
- mov %r11, %rcx
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_1_use_sse2)
-#ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %r8
-#else
- mov %rdi, %r8
+
+ /*
+ * Exit tail code:
+ * Up to 32 bytes are copied in the case of strcpy.
+ */
+ .p2align 4
+LABEL(less32bytes):
+ xor %ecx, %ecx
+LABEL(unaligned_exit):
+ add %r9, %rsi /* r9 holds offset of rsi */
+ mov %rcx, %r9
+ mov %r10, %rcx
+ shl %cl, %edx /* after the shl, edx gives the exact number of bytes to fill */
+ mov %r9, %rcx
+ .p2align 4
+LABEL(aligned_exit):
+ add %rcx, %rdi /* locate exact address for rdi */
+LABEL(less16bytes):
+ add %rcx, %rsi /* locate exact address for rsi */
+LABEL(aligned_16bytes):
+#ifdef USE_AS_STRNCPY
+ /*
+ * Null found in the 16 bytes checked. Set the bit in the bitmask
+ * corresponding to the strncpy count argument. We will copy up to the
+ * null (inclusive) or to the count, whichever comes first.
+ */
+ mov $1, %r9d
+ lea -1(%r8), %rcx
+ shl %cl, %r9d
+ cmp $32, %r8
+ ja LABEL(strncpy_tail)
+ or %r9d, %edx
+LABEL(strncpy_tail):
#endif
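
The r9d manipulation above folds the strncpy count into the null-byte bitmask, so that a single bit scan finds whichever limit comes first. A minimal sketch with a hypothetical helper name:

    #include <stdint.h>

    /*
     * null_mask has bit i set where byte i is '\0'. Also setting bit
     * (count - 1) makes the lowest set bit mark the last byte to copy,
     * whether that is the terminator or the count (count is nonzero
     * here; a zero count exits at function entry).
     */
    static unsigned
    merge_count_limit(unsigned null_mask, uint64_t count)
    {
            if (count <= 32)
                    null_mask |= 1u << (count - 1);
            return (null_mask);	/* bsf of this indexes tail_table */
    }
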
+ /*
+ * Check to see if BSF is fast on this processor. If not, use a
+ * different exit tail.
+ */
+ testb $USE_BSF, .memops_method(%rip)
+ jz LABEL(AMD_exit)
+ bsf %rdx, %rcx /* Find byte with null char */
+ lea LABEL(tail_table)(%rip), %r11
+ movslq (%r11, %rcx, 4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
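
Note that tail_table (and unaligned_table before it) stores self-relative offsets, .int LABEL(x) - LABEL(table), rather than absolute addresses: the movslq/lea pair rebases each 32-bit entry against the table's runtime address, which keeps the jump tables position-independent inside the shared libc.
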
- jz 2f
+#ifdef USE_AS_STRNCPY
+ /*
+ * Count reached before null found.
+ */
+ .p2align 4
+LABEL(less32bytes_strncpy_truncation):
+ xor %ecx, %ecx
+LABEL(strncpy_truncation_unaligned):
+ add %r9, %rsi /* next src char to copy */
+LABEL(strncpy_truncation_aligned):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ add $16, %r8 /* compensate for the final sub of 16 from the count */
+ lea -1(%r8), %rcx
+ lea LABEL(tail_table)(%rip), %r11
+ movslq (%r11, %rcx, 4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
- xor %eax, %eax /* bzero () would do too, but usually there are only a handfull of bytes left */
- shr $3, %rcx
- lea 1 (%rdi, %rdx), %rdi
- jz 1f
+ .p2align 4
+LABEL(strncpy_exitz):
+ mov %rdi, %rax
+ ret
+#endif
- rep stosq
+ .p2align 4
+LABEL(AMD_exit):
+ test %dl, %dl
+ jz LABEL(AMD_exit_more_8)
+ test $0x01, %dl
+ jnz LABEL(tail_0)
+ test $0x02, %dl
+ jnz LABEL(tail_1)
+ test $0x04, %dl
+ jnz LABEL(tail_2)
+ test $0x08, %dl
+ jnz LABEL(tail_3)
+ test $0x10, %dl
+ jnz LABEL(tail_4)
+ test $0x20, %dl
+ jnz LABEL(tail_5)
+ test $0x40, %dl
+ jnz LABEL(tail_6)
-1:
- mov %r11d, %ecx
- and $7, %ecx
- jz 2f
+ .p2align 4
+LABEL(tail_7): /* 8 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $8, %cl
+ sub $8, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
- .p2align 4,, 3
+#ifdef USE_AS_STRNCPY
+ /*
+ * Null-terminated src string is shorter than the count. Fill the rest of the
+ * destination with null chars.
+ */
+ .p2align 4
+LABEL(strncpy_fill_tail):
+ mov %rax, %rdx
+ movzx %cl, %rax
+ mov %r8, %rcx
+ add %rax, %rdi
+ xor %eax, %eax
+ shr $3, %ecx
+ jz LABEL(strncpy_fill_less_8)
-3:
- dec %ecx
+ rep stosq
+LABEL(strncpy_fill_less_8):
+ mov %r8, %rcx
+ and $7, %rcx
+ jz LABEL(strncpy_fill_return)
+LABEL(strncpy_fill_less_7):
+ sub $1, %ecx
mov %al, (%rdi, %rcx)
- jnz 3b
+ jnz LABEL(strncpy_fill_less_7)
+LABEL(strncpy_fill_return):
+ mov %rdx, %rax
+ ret
+#endif
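
strncpy_fill_tail above is the classic strncpy padding step: once the last byte is copied, the rest of the destination up to the count is zero-filled, using rep stosq for whole quadwords and a byte loop for the remainder. Functionally it is just a memset; a minimal sketch with hypothetical names:

    #include <string.h>

    /*
     * dst points at the bytes a tail_N block just wrote, copied is that
     * block's size (the value loaded into %cl), and remaining is the
     * count left over in %r8. Pad the destination with '\0'.
     */
    static void
    fill_tail(char *dst, size_t copied, size_t remaining)
    {
            memset(dst + copied, '\0', remaining);
    }
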
- .p2align 4,, 3
+ .p2align 4
+LABEL(tail_0): /* 1 byte */
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $1, %cl
+ sub $1, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
-2:
- mov %r8, %rax
- ret
+ .p2align 4
+LABEL(tail_1): /* 2 bytes */
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $2, %cl
+ sub $2, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_2): /* 3 bytes */
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov 1(%rsi), %cx
+ mov %cx, 1(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $3, %cl
+ sub $3, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+ .p2align 4
+LABEL(tail_3): /* 4 bytes */
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $4, %cl
+ sub $4, %r8
+ jnz LABEL(strncpy_fill_tail)
#endif
+ ret
- .p2align 4
+ .p2align 4
+LABEL(tail_4): /* 5 bytes */
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 1(%rsi), %edx
+ mov %edx, 1(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $5, %cl
+ sub $5, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
-LABEL(exitn):
-#ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-#else
- mov %rdi, %rax
+ .p2align 4
+LABEL(tail_5): /* 6 bytes */
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 2(%rsi), %edx
+ mov %edx, 2(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $6, %cl
+ sub $6, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_6): /* 7 bytes */
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 3(%rsi), %edx
+ mov %edx, 3(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $7, %cl
+ sub $7, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_8): /* 9 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 5(%rsi), %edx
+ mov %edx, 5(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $9, %cl
+ sub $9, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(AMD_exit_more_8):
+ test %dh, %dh
+ jz LABEL(AMD_exit_more_16)
+ test $0x01, %dh
+ jnz LABEL(tail_8)
+ test $0x02, %dh
+ jnz LABEL(tail_9)
+ test $0x04, %dh
+ jnz LABEL(tail_10)
+ test $0x08, %dh
+ jnz LABEL(tail_11)
+ test $0x10, %dh
+ jnz LABEL(tail_12)
+ test $0x20, %dh
+ jnz LABEL(tail_13)
+ test $0x40, %dh
+ jnz LABEL(tail_14)
+
+ .p2align 4
+LABEL(tail_15): /* 16 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $16, %cl
+ sub $16, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_9): /* 10 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 6(%rsi), %edx
+ mov %edx, 6(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $10, %cl
+ sub $10, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_10): /* 11 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 7(%rsi), %edx
+ mov %edx, 7(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $11, %cl
+ sub $11, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_11): /* 12 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %edx
+ mov %edx, 8(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $12, %cl
+ sub $12, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_12): /* 13 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 5(%rsi), %rcx
+ mov %rcx, 5(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $13, %cl
+ sub $13, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_13): /* 14 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 6(%rsi), %rcx
+ mov %rcx, 6(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $14, %cl
+ sub $14, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_14): /* 15 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 7(%rsi), %rcx
+ mov %rcx, 7(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $15, %cl
+ sub $15, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(AMD_exit_more_16):
+ shr $16, %edx
+ test %dl, %dl
+ jz LABEL(AMD_exit_more_24)
+ test $0x01, %dl
+ jnz LABEL(tail_16)
+ test $0x02, %dl
+ jnz LABEL(tail_17)
+ test $0x04, %dl
+ jnz LABEL(tail_18)
+ test $0x08, %dl
+ jnz LABEL(tail_19)
+ test $0x10, %dl
+ jnz LABEL(tail_20)
+ test $0x20, %dl
+ jnz LABEL(tail_21)
+ test $0x40, %dl
+ jnz LABEL(tail_22)
+
+ .p2align 4
+LABEL(tail_23): /* 24 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $24, %cl
+ sub $24, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_16): /* 17 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %cl
+ mov %cl, 16(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $17, %cl
+ sub $17, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_17): /* 18 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %cx
+ mov %cx, 16(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $18, %cl
+ sub $18, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_18): /* 19 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 15(%rsi), %ecx
+ mov %ecx, 15(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $19, %cl
+ sub $19, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_19): /* 20 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %ecx
+ mov %ecx, 16(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $20, %cl
+ sub $20, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_20): /* 21 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 13(%rsi), %rcx
+ mov %rcx, 13(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $21, %cl
+ sub $21, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_21): /* 22 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 14(%rsi), %rcx
+ mov %rcx, 14(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $22, %cl
+ sub $22, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_22): /* 23 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 15(%rsi), %rcx
+ mov %rcx, 15(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $23, %cl
+ sub $23, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(AMD_exit_more_24):
+ test $0x01, %dh
+ jnz LABEL(tail_24)
+ test $0x02, %dh
+ jnz LABEL(tail_25)
+ test $0x04, %dh
+ jnz LABEL(tail_26)
+ test $0x08, %dh
+ jnz LABEL(tail_27)
+ test $0x10, %dh
+ jnz LABEL(tail_28)
+ test $0x20, %dh
+ jnz LABEL(tail_29)
+ test $0x40, %dh
+ jnz LABEL(tail_30)
+
+ .p2align 4
+LABEL(tail_31): /* 32 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 24(%rsi), %rdx
+ mov %rdx, 24(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $32, %cl
+ sub $32, %r8
+ jnz LABEL(strncpy_fill_tail)
#endif
-	ret
+	ret
+
+	.p2align 4
+LABEL(tail_24): /* 25 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 21(%rsi), %edx
+ mov %edx, 21(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $25, %cl
+ sub $25, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_25): /* 26 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 22(%rsi), %edx
+ mov %edx, 22(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $26, %cl
+ sub $26, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_26): /* 27 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 23(%rsi), %edx
+ mov %edx, 23(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $27, %cl
+ sub $27, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_27): /* 28 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 24(%rsi), %edx
+ mov %edx, 24(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $28, %cl
+ sub $28, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_28): /* 29 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 21(%rsi), %rdx
+ mov %rdx, 21(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $29, %cl
+ sub $29, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_29): /* 30 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 22(%rsi), %rdx
+ mov %rdx, 22(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $30, %cl
+ sub $30, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_30): /* 31 bytes */
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 23(%rsi), %rdx
+ mov %rdx, 23(%rdi)
+#ifdef USE_AS_STRNCPY
+ mov $31, %cl
+ sub $31, %r8
+ jnz LABEL(strncpy_fill_tail)
+#endif
+ ret
+
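Note on the tail blocks above: each tail_N label copies exactly N+1 bytes
with a handful of fixed-offset loads and stores, letting the final 4- or
8-byte move overlap the previous one rather than trailing off into
single-byte copies.  In the strncpy build, %cl records how many bytes
were just written and %r8 holds the remaining count, so a nonzero
remainder falls into strncpy_fill_tail to zero-pad the destination as
strncpy requires.  A minimal C sketch of the overlapping-move idea
(illustrative only; the function name and the 9..16 range are
hypothetical, not libc code):

#include <stdint.h>
#include <string.h>

/*
 * Copy an exact length in the 9..16 byte range with two 8-byte moves
 * that may overlap in the middle -- the same trick tail_9..tail_15
 * perform with hard-coded offsets.  Caller guarantees 9 <= len <= 16
 * and non-overlapping buffers.
 */
void
copy_tail_9_to_16(char *dst, const char *src, size_t len)
{
	uint64_t head, tail;

	(void) memcpy(&head, src, 8);			/* bytes 0 .. 7 */
	(void) memcpy(&tail, src + len - 8, 8);		/* bytes len-8 .. len-1 */
	(void) memcpy(dst, &head, 8);
	(void) memcpy(dst + len - 8, &tail, 8);
}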
+ .pushsection .rodata
+ .p2align 4
+LABEL(tail_table):
+ .int LABEL(tail_0) - LABEL(tail_table) /* 1 byte */
+ .int LABEL(tail_1) - LABEL(tail_table)
+ .int LABEL(tail_2) - LABEL(tail_table)
+ .int LABEL(tail_3) - LABEL(tail_table)
+ .int LABEL(tail_4) - LABEL(tail_table)
+ .int LABEL(tail_5) - LABEL(tail_table)
+ .int LABEL(tail_6) - LABEL(tail_table)
+ .int LABEL(tail_7) - LABEL(tail_table)
+ .int LABEL(tail_8) - LABEL(tail_table)
+ .int LABEL(tail_9) - LABEL(tail_table)
+ .int LABEL(tail_10) - LABEL(tail_table)
+ .int LABEL(tail_11) - LABEL(tail_table)
+ .int LABEL(tail_12) - LABEL(tail_table)
+ .int LABEL(tail_13) - LABEL(tail_table)
+ .int LABEL(tail_14) - LABEL(tail_table)
+ .int LABEL(tail_15) - LABEL(tail_table)
+ .int LABEL(tail_16) - LABEL(tail_table)
+ .int LABEL(tail_17) - LABEL(tail_table)
+ .int LABEL(tail_18) - LABEL(tail_table)
+ .int LABEL(tail_19) - LABEL(tail_table)
+ .int LABEL(tail_20) - LABEL(tail_table)
+ .int LABEL(tail_21) - LABEL(tail_table)
+ .int LABEL(tail_22) - LABEL(tail_table)
+ .int LABEL(tail_23) - LABEL(tail_table)
+ .int LABEL(tail_24) - LABEL(tail_table)
+ .int LABEL(tail_25) - LABEL(tail_table)
+ .int LABEL(tail_26) - LABEL(tail_table)
+ .int LABEL(tail_27) - LABEL(tail_table)
+ .int LABEL(tail_28) - LABEL(tail_table)
+ .int LABEL(tail_29) - LABEL(tail_table)
+ .int LABEL(tail_30) - LABEL(tail_table)
+ .int LABEL(tail_31) - LABEL(tail_table) /* 32 bytes */
+
+ .p2align 4
+LABEL(unaligned_table):
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+ .int LABEL(ashr_1) - LABEL(unaligned_table)
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .popsection
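The two tables above store 32-bit self-relative offsets (.int target -
table) rather than absolute addresses, so this .rodata needs no
load-time relocations and stays position independent; the dispatch code
earlier in the file loads an entry, adds the table's run-time address,
and jumps.  A rough GNU C analogue using the labels-as-values extension
(illustrative only; GCC-specific, not the libc code):

/*
 * Self-relative jump table in GNU C: the static table holds label
 * differences, so it contains no absolute addresses -- mirroring the
 * .int LABEL(tail_N) - LABEL(tail_table) entries above.
 */
int
tail_dispatch(unsigned idx)		/* idx assumed < 3 */
{
	static const int off[] = {
		&&t0 - &&t0, &&t1 - &&t0, &&t2 - &&t0
	};

	goto *(&&t0 + off[idx]);	/* add offset to table base, jump */
t0:	return (0);
t1:	return (1);
t2:	return (2);
}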
#ifdef USE_AS_STRNCPY
SET_SIZE(strncpy)
#else
- SET_SIZE(strcpy) /* (char *, const char *) */
+ SET_SIZE(strcpy) /* (char *, const char *) */
#endif
diff --git a/usr/src/lib/libc/amd64/gen/strlen.s b/usr/src/lib/libc/amd64/gen/strlen.s
index e33009d3e1..3b41235678 100644
--- a/usr/src/lib/libc/amd64/gen/strlen.s
+++ b/usr/src/lib/libc/amd64/gen/strlen.s
@@ -1,430 +1,199 @@
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
*/
-
+
/*
- * Copyright (c) 2002 Advanced Micro Devices, Inc.
- *
+ * Copyright (c) 2009, Intel Corporation
* All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the
- * following conditions are met:
- *
- * + Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the
- * following disclaimer.
- *
- * + Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the
- * following disclaimer in the documentation and/or other
- * materials provided with the distribution.
- *
- * + Neither the name of Advanced Micro Devices, Inc. nor the
- * names of its contributors may be used to endorse or
- * promote products derived from this software without
- * specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
- * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
- * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
- * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * It is licensee's responsibility to comply with any export
- * regulations applicable in licensee's jurisdiction.
*/
- .file "strlen.s"
+/*
+ * strlen - calculate the length of a string
+ */
#include "SYS.h"
-#include "cache.h"
+#include "proc64_id.h"
#define LABEL(s) .strlen/**/s
- ENTRY(strlen) /* (const char *s) */
-
- mov %rdi, %rsi
- neg %rdi
-
-LABEL(aligntry):
- mov %rsi , %r8
- and $7, %r8d
- jz LABEL(alignafter)
-
-LABEL(align): /* 8-byte align */
- sub $8, %r8
-
- .p2align 4
-
-LABEL(alignloop):
- cmpb $0, (%rsi)
- je LABEL(exit)
-
- inc %rsi
- inc %r8
- jnz LABEL(alignloop)
-
- .p2align 4
-
-LABEL(alignafter):
-
-LABEL(56try):
-
-LABEL(56): /* 56-byte */
- mov (%rsi), %rax
- mov $0xfefefefefefefeff, %rcx
-
-LABEL(56loop):
- mov %rcx, %r8
- add %rax, %r8
- jnc LABEL(tail)
-
- xor %rax, %r8
- or %rcx, %r8
- inc %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- jnc LABEL(tail)
-
- xor %rax, %r8
- or %rcx, %r8
- inc %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- jnc LABEL(tail)
-
- xor %rax, %r8
- or %rcx, %r8
- inc %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- jnc LABEL(tail)
-
- xor %rax, %r8
- or %rcx, %r8
- inc %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- jnc LABEL(tail)
-
- xor %rax, %r8
- or %rcx, %r8
- inc %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- jnc LABEL(tail)
-
- xor %rax, %r8
- or %rcx, %r8
- inc %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- jnc LABEL(tail)
-
- xor %rax, %r8
- or %rcx, %r8
- inc %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
-LABEL(56after):
-
-LABEL(32): /* 32-byte */
- mov _sref_(.amd64cache1), %r9
-
- .p2align 4
-
-LABEL(32loop):
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- sub $32, %r9
-
- mov 8 (%rsi), %rax
- lea 8 (%rsi), %rsi
-
- jbe LABEL(32loop)
-
-LABEL(32after):
-
-LABEL(pretry):
-
-LABEL(pre): /* 64-byte prefetch */
-
- .p2align 4
-
-LABEL(preloop):
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- mov %rcx, %r8
- add %rax, %r8
- sbb %rdx, %rdx
-
- xor %rax, %r8
- or %rcx, %r8
- sub %rdx, %r8
- jnz LABEL(tail)
-
- prefetchnta 512 (%rsi) /* 3DNow: use prefetch */
-
- mov 8 (%rsi), %rax
- add $8, %rsi
-
- jmp LABEL(preloop)
-
- .p2align 4
-
-LABEL(preafter):
-
-LABEL(tailtry):
-
-LABEL(tail): /* 4-byte tail */
-
-LABEL(tailloop):
- test %al, %al
- jz LABEL(exit)
-
- inc %rsi
-
- test %ah, %ah
- jz LABEL(exit)
-
- inc %rsi
-
- test $0x00ff0000, %eax
- jz LABEL(exit)
-
- inc %rsi
-
- test $0xff000000, %eax
- jz LABEL(exit)
-
- inc %rsi
-
- shr $32, %rax
- jmp LABEL(tailloop)
-
-LABEL(tailafter):
-
- .p2align 4
-
+	/*
+	 * This implementation uses SSE instructions to compare 16 bytes
+	 * at a time, looking for the terminating null byte of the string.
+	 */
+ ENTRY(strlen) /* (const char *s) */
+ mov %rdi, %rsi /* keep original %rdi value */
+ mov %rsi, %rcx
+ pxor %xmm0, %xmm0 /* 16 null chars */
+ and $15, %rcx
+ jz LABEL(align16_loop) /* string is 16 byte aligned */
+
+	/*
+	 * Unaligned case. Round the address down to a 16-byte boundary and
+	 * compare those 16 bytes for a null byte, then shift the result
+	 * mask right to discard the bits for bytes preceding the start of
+	 * the string.
+	 */
+LABEL(unalign16):
+ and $0xfffffffffffffff0, %rsi
+
+ pcmpeqb (%rsi), %xmm0
+ lea 16(%rdi), %rsi
+ pmovmskb %xmm0, %edx
+
+ shr %cl, %edx /* Compensate for bytes preceding the string */
+ test %edx, %edx
+ jnz LABEL(exit)
+ sub %rcx, %rsi /* no null, adjust to next 16-byte boundary */
+	pxor	%xmm0, %xmm0	/* re-zero %xmm0; matches before the string may have set bytes */
+
+ .p2align 4
+LABEL(align16_loop): /* 16 byte aligned */
+ pcmpeqb (%rsi), %xmm0 /* look for null bytes */
+	pmovmskb %xmm0, %edx	/* one mask bit per byte, set where a null was found */
+
+ add $16, %rsi /* prepare to search next 16 bytes */
+ test %edx, %edx /* if no null byte, %edx must be 0 */
+ jnz LABEL(exit) /* found a null */
+
+ pcmpeqb (%rsi), %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %rsi
+ test %edx, %edx
+ jnz LABEL(exit)
+
+ pcmpeqb (%rsi), %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %rsi
+ test %edx, %edx
+ jnz LABEL(exit)
+
+ pcmpeqb (%rsi), %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %rsi
+ test %edx, %edx
+ jz LABEL(align16_loop)
+
+ .p2align 4
LABEL(exit):
- lea (%rdi, %rsi), %rax
- ret
-
+ neg %rdi
+	/*
+	 * Check whether BSF is fast on this processor.  If it is not, use
+	 * a different exit tail that locates the first set bit (the index
+	 * of the null byte) with byte-wise tests instead.
+	 */
+ testl $USE_BSF, .memops_method(%rip)
+ jz LABEL(AMD_exit)
+
+ lea -16(%rdi, %rsi), %rax /* calculate exact offset */
+ bsf %edx, %ecx /* Least significant 1 bit is index of null */
+	lea	(%rax, %rcx), %rax
+ ret
+
+ /*
+ * This exit tail does not use the bsf instruction.
+ */
+ .p2align 4
+LABEL(AMD_exit):
+ lea -16(%rdi, %rsi), %rax
+ test %dl, %dl
+ jz LABEL(exit_high)
+ test $0x01, %dl
+ jnz LABEL(exit_tail0)
+
+ test $0x02, %dl
+ jnz LABEL(exit_tail1)
+
+ .p2align 4
+ test $0x04, %dl
+ jnz LABEL(exit_tail2)
+
+ test $0x08, %dl
+ jnz LABEL(exit_tail3)
+
+ test $0x10, %dl
+ jnz LABEL(exit_tail4)
+
+ test $0x20, %dl
+ jnz LABEL(exit_tail5)
+
+ test $0x40, %dl
+ jnz LABEL(exit_tail6)
+ add $7, %rax
+ ret
+
+ .p2align 4
+LABEL(exit_high):
+ add $8, %rax
+ test $0x01, %dh
+ jnz LABEL(exit_tail0)
+
+ test $0x02, %dh
+ jnz LABEL(exit_tail1)
+
+ test $0x04, %dh
+ jnz LABEL(exit_tail2)
+
+ test $0x08, %dh
+ jnz LABEL(exit_tail3)
+
+ test $0x10, %dh
+ jnz LABEL(exit_tail4)
+
+ test $0x20, %dh
+ jnz LABEL(exit_tail5)
+
+ test $0x40, %dh
+ jnz LABEL(exit_tail6)
+ add $7, %rax
+ ret
+
+ .p2align 4
+LABEL(exit_tail0):
+ xor %ecx, %ecx
+ ret
+
+ .p2align 4
+LABEL(exit_tail1):
+ add $1, %rax
+ ret
+
+ .p2align 4
+LABEL(exit_tail2):
+ add $2, %rax
+ ret
+
+ .p2align 4
+LABEL(exit_tail3):
+ add $3, %rax
+ ret
+
+ .p2align 4
+LABEL(exit_tail4):
+ add $4, %rax
+ ret
+
+ .p2align 4
+LABEL(exit_tail5):
+ add $5, %rax
+ ret
+
+ .p2align 4
+LABEL(exit_tail6):
+ add $6, %rax
+ ret
SET_SIZE(strlen)
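For reference, the algorithm above reduces to a short loop in C with
SSE2 intrinsics (a sketch under the stated assumptions, not the libc
implementation; __builtin_ctz is the GCC/Clang stand-in for the BSF
exit path, and the assembly additionally unrolls the loop four times
and falls back to the byte-wise AMD_exit tail when USE_BSF is not set):

#include <emmintrin.h>		/* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

size_t
strlen_sse2_sketch(const char *s)
{
	const __m128i zero = _mm_setzero_si128();
	uintptr_t addr = (uintptr_t)s;
	unsigned off = (unsigned)(addr & 15);
	const char *p = (const char *)(addr - off);	/* round down to 16 */
	unsigned mask;

	/*
	 * The aligned load below may read bytes before the string, but it
	 * cannot cross into a page the string does not touch, so it is
	 * safe.  pcmpeqb marks the null bytes; pmovmskb yields one bit
	 * per byte.
	 */
	mask = (unsigned)_mm_movemask_epi8(
	    _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
	mask >>= off;		/* discard bytes preceding the string */
	if (mask != 0)
		return ((size_t)__builtin_ctz(mask));

	for (;;) {
		p += 16;
		mask = (unsigned)_mm_movemask_epi8(
		    _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
		if (mask != 0)
			return ((size_t)(p - s) +
			    (size_t)__builtin_ctz(mask));
	}
}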