author    Robert Mustacchi <rm@joyent.com>    2018-06-06 00:54:12 +0000
committer Robert Mustacchi <rm@joyent.com>    2018-06-11 22:50:44 +0000
commit    a33f24fd1b0665ec1cb9f57b5e860fed1d393793 (patch)
tree      c4b3f5f9d236d020ad3b69120bc770d35842bf58 /usr/src/cmd/sgs/rtld
parent    2d24d386afb2debdcfd0eb0aa496b2642ce2e7b7 (diff)
OS-7003 Initial xsave xstate_bv should not include all features
OS-6917 rtld should conditionally save AVX-512 state

Reviewed by: John Levon <john.levon@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Approved by: Alex Wilson <alex.wilson@joyent.com>
Diffstat (limited to 'usr/src/cmd/sgs/rtld')
-rw-r--r--  usr/src/cmd/sgs/rtld/amd64/_setup.c   |  79
-rw-r--r--  usr/src/cmd/sgs/rtld/amd64/boot_elf.s | 942
2 files changed, 487 insertions, 534 deletions
diff --git a/usr/src/cmd/sgs/rtld/amd64/_setup.c b/usr/src/cmd/sgs/rtld/amd64/_setup.c
index cd47f3b91f..2a92d72565 100644
--- a/usr/src/cmd/sgs/rtld/amd64/_setup.c
+++ b/usr/src/cmd/sgs/rtld/amd64/_setup.c
@@ -24,7 +24,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
*/
/*
@@ -49,6 +49,68 @@
#include "_audit.h"
#include "msg.h"
+/*
+ * Number of bytes to save for register usage.
+ */
+uint_t _plt_save_size;
+void (*_plt_fp_save)(void *);
+void (*_plt_fp_restore)(void *);
+
+extern void _elf_rtbndr_fp_save_orig(void *);
+extern void _elf_rtbndr_fp_restore_orig(void *);
+extern void _elf_rtbndr_fp_fxsave(void *);
+extern void _elf_rtbndr_fp_fxrestore(void *);
+extern void _elf_rtbndr_fp_xsave(void *);
+extern void _elf_rtbndr_fp_xrestore(void *);
+
+/*
+ * Based on what the kernel has told us, go through and set up the various
+ * pointers that we'll need for elf_rtbndr for the FPU.
+ */
+static void
+_setup_plt_fpu(int kind, size_t len)
+{
+ /*
+ * If we didn't get a length for some reason, fall back to the old
+ * implementation.
+ */
+ if (len == 0)
+ kind = -1;
+
+ switch (kind) {
+ case AT_386_FPINFO_FXSAVE:
+ _plt_fp_save = _elf_rtbndr_fp_fxsave;
+ _plt_fp_restore = _elf_rtbndr_fp_fxrestore;
+ _plt_save_size = len;
+ break;
+ /*
+ * We can treat processors that don't correctly handle the exception
+ * information in xsave the same way we do others. The information
+ * that may or may not be properly saved and restored should not be
+ * relevant to us because of the ABI.
+ */
+ case AT_386_FPINFO_XSAVE:
+ case AT_386_FPINFO_XSAVE_AMD:
+ _plt_fp_save = _elf_rtbndr_fp_xsave;
+ _plt_fp_restore = _elf_rtbndr_fp_xrestore;
+ _plt_save_size = len;
+ break;
+ default:
+ _plt_fp_save = _elf_rtbndr_fp_save_orig;
+ _plt_fp_restore = _elf_rtbndr_fp_restore_orig;
+ /*
+ * The ABI says that 8 floating point registers are used for
+ * passing arguments (%xmm0 through %xmm7). Because these
+ * registers on some platforms may shadow the %ymm and %zmm
+ * registers, we end up needing to size this for the maximally
+ * sized register we care about, a 512-bit (64-byte) zmm
+ * register.
+ */
+ _plt_save_size = 64 * 8;
+ break;
+ }
+}
+
/* VARARGS */
unsigned long
_setup(Boot *ebp, Dyn *ld_dyn)
@@ -67,7 +129,8 @@ _setup(Boot *ebp, Dyn *ld_dyn)
uid_t uid = (uid_t)-1, euid = (uid_t)-1;
gid_t gid = (gid_t)-1, egid = (gid_t)-1;
char *_platform = NULL, *_execname = NULL, *_emulator = NULL;
- int auxflags = -1;
+ int auxflags = -1, fpkind = -1;
+ size_t fpsize = 0;
/*
* Scan the bootstrap structure to pick up the basics.
@@ -158,6 +221,12 @@ _setup(Boot *ebp, Dyn *ld_dyn)
/* name of emulation library, if any */
_emulator = auxv->a_un.a_ptr;
break;
+ case AT_SUN_FPTYPE:
+ fpkind = (int)auxv->a_un.a_val;
+ break;
+ case AT_SUN_FPSIZE:
+ fpsize = (size_t)auxv->a_un.a_val;
+ break;
}
}
@@ -231,6 +300,12 @@ _setup(Boot *ebp, Dyn *ld_dyn)
sizeof (ulong_t) + sizeof (Sym);
/*
+ * Initialize the amd64 specific PLT relocation constants based on the
+ * FP information that we have.
+ */
+ _setup_plt_fpu(fpkind, fpsize);
+
+ /*
* Continue with generic startup processing.
*/
if ((lmp = setup((char **)_envp, (auxv_t *)_auxv, _flags, _platform,
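(For context on the aux vector handling added above: a minimal, free-standing C sketch of the scan, not part of the patch. It assumes <sys/auxv.h> supplies auxv_t, AT_NULL, AT_SUN_FPTYPE and AT_SUN_FPSIZE, as on illumos after this change.)

#include <sys/auxv.h>
#include <stddef.h>

/*
 * Walk an aux vector until AT_NULL, picking out the FPU save hints that
 * the kernel now passes; this mirrors the switch added to _setup() above.
 */
static void
scan_fpu_auxv(auxv_t *auxv, int *kindp, size_t *sizep)
{
	*kindp = -1;	/* same defaults that _setup() starts with */
	*sizep = 0;

	for (; auxv->a_type != AT_NULL; auxv++) {
		switch (auxv->a_type) {
		case AT_SUN_FPTYPE:
			*kindp = (int)auxv->a_un.a_val;
			break;
		case AT_SUN_FPSIZE:
			*sizep = (size_t)auxv->a_un.a_val;
			break;
		}
	}
}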
diff --git a/usr/src/cmd/sgs/rtld/amd64/boot_elf.s b/usr/src/cmd/sgs/rtld/amd64/boot_elf.s
index 67301a0c96..36f136e31d 100644
--- a/usr/src/cmd/sgs/rtld/amd64/boot_elf.s
+++ b/usr/src/cmd/sgs/rtld/amd64/boot_elf.s
@@ -22,7 +22,86 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright (c) 2017 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Welcome to the magic behind the PLT (procedure linkage table). When rtld
+ * fills out the PLT entries, it will refer initially to the functions in this
+ * file. As such our goal is simple:
+ *
+ * The lie of the function call must be preserved at all costs.
+ *
+ * This means that we need to prepare the system for an arbitrary series of
+ * instructions to be called. For example, as a side effect of resolving a
+ * symbol we may need to open a shared object which will cause any _init
+ * functions to be called. Those functions can use any and all of the ABI state
+ * that they desire (for example, the FPU registers). Therefore we must save and
+ * restore all the ABI mandated registers here.
+ *
+ * For the full information about what we need to save and restore and why,
+ * please see the System V amd64 PS ABI '3.2.3 Parameter Passing'. For general
+ * purpose registers, we need to take care of the following:
+ *
+ * %rax - Used for information about the number of vector arguments
+ * %rdi - arg0
+ * %rsi - arg1
+ * %rdx - arg2
+ * %rcx - arg3
+ * %r8 - arg4
+ * %r9 - arg5
+ * %r10 - static chain pointer
+ *
+ * Unfortunately, the world of the FPU is more complicated.
+ *
+ * The ABI mandates that we must save %xmm0-%xmm7. On newer Intel processors,
+ * %xmm0-%xmm7 shadow %ymm0-%ymm7 and %zmm0-%zmm7. Historically, when saving the
+ * FPU, we only saved and restored these eight registers. Unfortunately, this
+ * process itself ended up having side effects. Because the registers shadow one
+ * another, if we saved a full %zmm register when only a %xmm register was
+ * valid, we would end up causing the processor to think that the full %zmm
+ * register was valid. Once it believed that this was the case, it would then
+ * degrade performance of code that only used the %xmm registers.
+ *
+ * One way to tackle this problem would have been to use xgetbv with ecx=1 to
+ * get information about what was actually in use and only save and restore
+ * that. You can imagine that this logic roughly ends up as something like:
+ *
+ * if (zmm_inuse)
+ * save_zmm()
+ * if (ymm_inuse)
+ * save_ymm()
+ * save_xmm()
+ *
+ * However, this logic leaves us at the mercy of the branch predictor. This
+ * means that all of our efforts can end up still causing the CPU to execute
+ * things to make it think that some of these other FPU registers are in use and
+ * thus defeat the optimizations that it has.
+ *
+ * To deal with this problem, Intel has suggested using the xsave family of
+ * instructions. The kernel provides information about the size required for the
+ * floating point registers as well as which of several methods we need to
+ * employ through the aux vector. This gets us out of looking at the hardware
+ * capabilities and making a decision on every PLT binding. The amd64-specific
+ * portion of rtld processes those values at startup and selects the save and
+ * restore functions to use.
+ *
+ * There are two different functions that we export. The first is elf_rtbndr().
+ * This is basically the glue that gets us into the PLT and to perform
+ * relocations. elf_rtbndr() determines the address of the function that we must
+ * call and arranges its stack such that when we return from elf_rtbndr() we
+ * will instead jump to the actual relocated function which will return to the
+ * original caller. Because of this, we must preserve all of the registers that
+ * are used for arguments and restore them before returning.
+ *
+ * The second function we export is elf_plt_trace(). This is used to add support
+ * for audit libraries among other things. elf_plt_trace() may or may not call
+ * the underlying function as a side effect or merely set up its return to it.
+ * This changes how we handle %rax. If we call the function ourself, then we end
+ * up making sure that %rax is the return value versus the initial value. In
+ * addition, because we get %r11 from the surrounding PLT code, we opt to
+ * preserve it in case some of the relocation logic ever ends up calling back
+ * into us again.
*/
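(A rough, free-standing C sketch of the flow just described, not part of the patch. bind_symbol() is a made-up stand-in for elf_rtbndr(); plt_fp_save, plt_fp_restore and plt_save_size stand in for the _plt_* variables that _setup.c fills in. The real code carves the save area out of the stack rather than the heap.)

#include <stdint.h>
#include <stdlib.h>

extern void (*plt_fp_save)(void *);	/* selected once at startup */
extern void (*plt_fp_restore)(void *);
extern size_t plt_save_size;		/* kernel-provided area size */

uintptr_t
bind_symbol(uintptr_t (*resolve)(void))
{
	void *area;
	uintptr_t dest;

	/* xsave wants a 64-byte-aligned area; fxsave is happy with that too */
	if (posix_memalign(&area, 64, plt_save_size) != 0)
		abort();

	plt_fp_save(area);	/* preserve the caller's FPU argument state */
	dest = resolve();	/* may run arbitrary _init code in a new DSO */
	plt_fp_restore(area);	/* put that state back before jumping on */

	free(area);
	return (dest);
}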
#if defined(lint)
@@ -34,23 +113,171 @@
#include <sys/regset.h>
#include <sys/auxv_386.h>
-/* ARGSUSED0 */
-int
-elf_plt_trace()
-{
- return (0);
-}
#else
#include <link.h>
#include <_audit.h>
#include <sys/asm_linkage.h>
#include <sys/auxv_386.h>
+#include <sys/x86_archext.h>
+
+/*
+ * This macro is used to zero the xsave header. The contents of the scratch
+ * register will be destroyed. loc should contain the starting address of the
+ * xsave header.
+ */
+#define XSAVE_HEADER_ZERO(scratch, loc) \
+ xorq scratch, scratch; \
+ movq scratch, 0x200(loc); \
+ movq scratch, 0x208(loc); \
+ movq scratch, 0x210(loc); \
+ movq scratch, 0x218(loc); \
+ movq scratch, 0x220(loc); \
+ movq scratch, 0x228(loc); \
+ movq scratch, 0x230(loc); \
+ movq scratch, 0x238(loc)
+
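(A C equivalent of the macro, for illustration only: the xsave header is the 64 bytes at offset 0x200 of the save area, holding XSTATE_BV, XCOMP_BV and reserved words, and it must be cleared before the first xsave into a fresh buffer so a later xrstor does not consume stale feature bits.)

#include <stdint.h>
#include <string.h>

#define	XSAVE_HDR_OFF	0x200	/* header starts after the 512-byte legacy region */
#define	XSAVE_HDR_SIZE	0x40	/* 0x200 through 0x238: eight 8-byte words */

static void
xsave_header_zero(uint8_t *area)
{
	memset(area + XSAVE_HDR_OFF, 0, XSAVE_HDR_SIZE);
}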
.file "boot_elf.s"
.text
/*
+ * This section of the code contains glue functions that are used to take care
+ * of saving and restoring the FPU. We deal with this in a few different ways
+ * based on the hardware support and what exists. Historically we've only saved
+ * and restored the first 8 floating point registers rather than the entire FPU.
+ * That implementation still exists here and is kept around mostly as an
+ * insurance policy.
+ */
+ ENTRY(_elf_rtbndr_fp_save_orig)
+ movq org_scapset@GOTPCREL(%rip),%r11
+ movq (%r11),%r11 /* Syscapset_t pointer */
+ movl 8(%r11),%edx /* sc_hw_2 */
+ testl $AV_386_2_AVX512F,%edx
+ jne .save_zmm
+ movl (%r11),%edx /* sc_hw_1 */
+ testl $AV_386_AVX,%edx
+ jne .save_ymm
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm1, 64(%rdi)
+ movdqa %xmm2, 128(%rdi)
+ movdqa %xmm3, 192(%rdi)
+ movdqa %xmm4, 256(%rdi)
+ movdqa %xmm5, 320(%rdi)
+ movdqa %xmm6, 384(%rdi)
+ movdqa %xmm7, 448(%rdi)
+ jmp .save_finish
+
+.save_ymm:
+ vmovdqa %ymm0, (%rdi)
+ vmovdqa %ymm1, 64(%rdi)
+ vmovdqa %ymm2, 128(%rdi)
+ vmovdqa %ymm3, 192(%rdi)
+ vmovdqa %ymm4, 256(%rdi)
+ vmovdqa %ymm5, 320(%rdi)
+ vmovdqa %ymm6, 384(%rdi)
+ vmovdqa %ymm7, 448(%rdi)
+ jmp .save_finish
+
+.save_zmm:
+ vmovdqa64 %zmm0, (%rdi)
+ vmovdqa64 %zmm1, 64(%rdi)
+ vmovdqa64 %zmm2, 128(%rdi)
+ vmovdqa64 %zmm3, 192(%rdi)
+ vmovdqa64 %zmm4, 256(%rdi)
+ vmovdqa64 %zmm5, 320(%rdi)
+ vmovdqa64 %zmm6, 384(%rdi)
+ vmovdqa64 %zmm7, 448(%rdi)
+
+.save_finish:
+ ret
+ SET_SIZE(_elf_rtbndr_fp_save_orig)
+
+ ENTRY(_elf_rtbndr_fp_restore_orig)
+ movq org_scapset@GOTPCREL(%rip),%r11
+ movq (%r11),%r11 /* Syscapset_t pointer */
+ movl 8(%r11),%edx /* sc_hw_2 */
+ testl $AV_386_2_AVX512F,%edx
+ jne .restore_zmm
+ movl (%r11),%edx /* sc_hw_1 */
+ testl $AV_386_AVX,%edx
+ jne .restore_ymm
+
+ movdqa (%rdi), %xmm0
+ movdqa 64(%rdi), %xmm1
+ movdqa 128(%rdi), %xmm2
+ movdqa 192(%rdi), %xmm3
+ movdqa 256(%rdi), %xmm4
+ movdqa 320(%rdi), %xmm5
+ movdqa 384(%rdi), %xmm6
+ movdqa 448(%rdi), %xmm7
+ jmp .restore_finish
+
+.restore_ymm:
+ vmovdqa (%rdi), %ymm0
+ vmovdqa 64(%rdi), %ymm1
+ vmovdqa 128(%rdi), %ymm2
+ vmovdqa 192(%rdi), %ymm3
+ vmovdqa 256(%rdi), %ymm4
+ vmovdqa 320(%rdi), %ymm5
+ vmovdqa 384(%rdi), %ymm6
+ vmovdqa 448(%rdi), %ymm7
+ jmp .restore_finish
+
+.restore_zmm:
+ vmovdqa64 (%rdi), %zmm0
+ vmovdqa64 64(%rdi), %zmm1
+ vmovdqa64 128(%rdi), %zmm2
+ vmovdqa64 192(%rdi), %zmm3
+ vmovdqa64 256(%rdi), %zmm4
+ vmovdqa64 320(%rdi), %zmm5
+ vmovdqa64 384(%rdi), %zmm6
+ vmovdqa64 448(%rdi), %zmm7
+
+.restore_finish:
+ ret
+ SET_SIZE(_elf_rtbndr_fp_restore_orig)
+
+ ENTRY(_elf_rtbndr_fp_fxsave)
+ fxsaveq (%rdi)
+ ret
+ SET_SIZE(_elf_rtbndr_fp_fxsave)
+
+ ENTRY(_elf_rtbndr_fp_fxrestore)
+ fxrstor (%rdi)
+ ret
+ SET_SIZE(_elf_rtbndr_fp_fxrestore)
+
+ ENTRY(_elf_rtbndr_fp_xsave)
+ XSAVE_HEADER_ZERO(%rdx, %rdi)
+ movq $_CONST(XFEATURE_FP_ALL), %rdx
+ movl %edx, %eax
+ shrq $32, %rdx
+ xsave (%rdi) /* save data */
+ ret
+ SET_SIZE(_elf_rtbndr_fp_xsave)
+
+ ENTRY(_elf_rtbndr_fp_xrestore)
+ movq $_CONST(XFEATURE_FP_ALL), %rdx
+ movl %edx, %eax
+ shrq $32, %rdx
+ xrstor (%rdi) /* restore data */
+ ret
+ SET_SIZE(_elf_rtbndr_fp_xrestore)
+
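(The movl/shrq pairs above simply split the 64-bit XFEATURE_FP_ALL request mask into the %edx:%eax halves that xsave and xrstor expect; a C sketch of that split, for illustration:)

#include <stdint.h>

/* Split a 64-bit xsave requested-feature bitmap into its edx:eax halves. */
static inline void
split_rfbm(uint64_t mask, uint32_t *eax, uint32_t *edx)
{
	*eax = (uint32_t)mask;		/* low 32 bits go in %eax */
	*edx = (uint32_t)(mask >> 32);	/* high 32 bits go in %edx */
}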
+#endif
+
+#if defined(lint)
+
+/* ARGSUSED0 */
+int
+elf_plt_trace()
+{
+ return (0);
+}
+
+#else
+
+/*
* On entry the 'glue code' has already done the following:
*
* pushq %rbp
@@ -79,173 +306,80 @@ elf_plt_trace()
* the second 8 bytes are to align the stack and are available
* for use.
*/
-#define REFLMP_OFF 0x0
-#define DEFLMP_OFF 0x8
+#define REFLMP_OFF 0x0
+#define DEFLMP_OFF 0x8
#define SYMNDX_OFF 0x10
#define SBFLAGS_OFF 0x14
#define SYMDEF_OFF 0x18
#define SYMDEF_VALUE_OFF 0x20
+
/*
- * Local stack space storage for elf_plt_trace is allocated
- * as follows:
- *
- * First - before we got here - %rsp has been decremented
- * by 0x10 to make space for the dyndata ptr (and another
- * free word). In addition to that, we create space
- * for the following:
+ * Next, we need to create a bunch of local storage. First, we have to preserve
+ * the standard registers per the amd64 ABI. This means we need to deal with:
+ * %rax - Used for information about the number of vector arguments
+ * %rdi - arg0
+ * %rsi - arg1
+ * %rdx - arg2
+ * %rcx - arg3
+ * %r8 - arg4
+ * %r9 - arg5
+ * %r10 - static chain pointer
+ * %r11 - PLT Interwork register, our caller is using this, so it's not
+ * a temporary for us.
*
- * La_amd64_regs 8 * 8: 64
- * prev_stack_size 8 8
- * Saved regs:
- * %rdi 8
- * %rsi 8
- * %rdx 8
- * %rcx 8
- * %r8 8
- * %r9 8
- * %r10 8
- * %r11 8
- * %rax 8
- * =======
- * Subtotal: 144 (64byte aligned)
- *
- * The amd64 ABI says that the first 8 floating point registers are used to
- * pass arguments. Because we may end up loading a DSO which has an _init
- * section which uses these registers, we must save them. While various
- * CPUs have different sized floating point registers, the largest are
- * AVX512 512-bit (64-byte) %zmm registers. These need to be 64-byte aligned.
- * Although the example below is showing offsets for %rbp, we actually save the
- * registers offset from %rsp, which is forced into 64-bit alignment.
- *
- * Saved Media Regs (used to pass floating point args):
- * %zmm0 - %zmm7 64 * 8: 512
- * =======
- * Total: 656 (64byte aligned)
- *
- * So - will subtract the following to create enough space
- *
- * -8(%rbp) store dyndata ptr
- * -16(%rbp) store call destination
- * -80(%rbp) space for La_amd64_regs
- * -88(%rbp) prev stack size
- * The next %rbp offsets are only true if the caller had correct stack
- * alignment. See note above SPRDIOFF for why we use %rsp alignment to
- * access these stack fields.
- * -96(%rbp) entering %rdi
- * -104(%rbp) entering %rsi
- * -112(%rbp) entering %rdx
- * -120(%rbp) entering %rcx
- * -128(%rbp) entering %r8
- * -136(%rbp) entering %r9
- * -144(%rbp) entering %r10
- * -152(%rbp) entering %r11
- * -160(%rbp) entering %rax
- * -224(%rbp) entering %xmm0
- * -288(%rbp) entering %xmm1
- * -352(%rbp) entering %xmm2
- * -288(%rbp) entering %xmm3
- * -416(%rbp) entering %xmm4
- * -480(%rbp) entering %xmm5
- * -544(%rbp) entering %xmm6
- * -608(%rbp) entering %xmm7
+ * In addition, we need to save the amd64 ABI floating point arguments. Finally,
+ * we need to deal with our local storage. We need a La_amd64_regs and a
+ * uint64_t for the previous stack size.
*
+ * To deal with this and the potentially variable size of the FPU regs, we have
+ * to play a few different games. We refer to all of the standard registers, the
+ * previous stack size, and La_amd64_regs structure off of %rbp. These are all
+ * values that are below %rbp.
*/
-#define SPDYNOFF -8
-#define SPDESTOFF -16
-#define SPLAREGOFF -80
-#define SPPRVSTKOFF -88
-
-/*
- * The next set of offsets are relative to %rsp.
- * We guarantee %rsp is 64-byte aligned. This guarantees the zmm registers are
- * saved to 64-byte aligned addresses.
- * %rbp may only be 8 byte aligned if we came in from non-ABI compliant code.
- */
-#define SPRDIOFF 576
-#define SPRSIOFF 568
-#define SPRDXOFF 560
-#define SPRCXOFF 552
-#define SPR8OFF 544
-#define SPR9OFF 536
-#define SPR10OFF 528
-#define SPR11OFF 520
-#define SPRAXOFF 512
-#define SPXMM0OFF 448
-#define SPXMM1OFF 384
-#define SPXMM2OFF 320
-#define SPXMM3OFF 256
-#define SPXMM4OFF 192
-#define SPXMM5OFF 128
-#define SPXMM6OFF 64
-#define SPXMM7OFF 0
-
- /* See elf_rtbndr for explanation behind org_scapset */
- .extern org_scapset
- .globl elf_plt_trace
- .type elf_plt_trace,@function
- .align 16
-elf_plt_trace:
+#define SPDYNOFF -8
+#define SPDESTOFF -16
+#define SPPRVSTKOFF -24
+#define SPLAREGOFF -88
+#define ORIG_RDI -96
+#define ORIG_RSI -104
+#define ORIG_RDX -112
+#define ORIG_RCX -120
+#define ORIG_R8 -128
+#define ORIG_R9 -136
+#define ORIG_R10 -144
+#define ORIG_R11 -152
+#define ORIG_RAX -160
+#define PLT_SAVE_OFF 168
+
+ ENTRY(elf_plt_trace)
/*
- * Enforce 64-byte stack alignment here.
- * The next andq instruction essentially does this:
- * if necessary, subtract N bytes from %rsp to align on 64 bytes.
+ * Save our standard registers. After that, make room for the FPU save
+ * area and 64-byte align the stack. The frame pointer has already
+ * been pushed for us by the glue code.
*/
- andq $-64, %rsp /* enforce 64-byte stack alignment */
- subq $656,%rsp / create some local storage
-
- movq %rdi, SPRDIOFF(%rsp)
- movq %rsi, SPRSIOFF(%rsp)
- movq %rdx, SPRDXOFF(%rsp)
- movq %rcx, SPRCXOFF(%rsp)
- movq %r8, SPR8OFF(%rsp)
- movq %r9, SPR9OFF(%rsp)
- movq %r10, SPR10OFF(%rsp)
- movq %r11, SPR11OFF(%rsp)
- movq %rax, SPRAXOFF(%rsp)
-
- movq org_scapset@GOTPCREL(%rip),%r9
- movq (%r9),%r9 /* Syscapset_t pointer */
- movl 8(%r9),%edx /* sc_hw_2 */
- testl $AV_386_2_AVX512F,%edx
- jne .trace_save_zmm
- movl (%r9),%edx /* sc_hw_1 */
- testl $AV_386_AVX,%edx
- jne .trace_save_ymm
-
-.trace_save_xmm:
- movdqa %xmm0, SPXMM0OFF(%rsp)
- movdqa %xmm1, SPXMM1OFF(%rsp)
- movdqa %xmm2, SPXMM2OFF(%rsp)
- movdqa %xmm3, SPXMM3OFF(%rsp)
- movdqa %xmm4, SPXMM4OFF(%rsp)
- movdqa %xmm5, SPXMM5OFF(%rsp)
- movdqa %xmm6, SPXMM6OFF(%rsp)
- movdqa %xmm7, SPXMM7OFF(%rsp)
- jmp .trace_save_finish
-
-.trace_save_ymm:
- vmovdqa %ymm0, SPXMM0OFF(%rsp)
- vmovdqa %ymm1, SPXMM1OFF(%rsp)
- vmovdqa %ymm2, SPXMM2OFF(%rsp)
- vmovdqa %ymm3, SPXMM3OFF(%rsp)
- vmovdqa %ymm4, SPXMM4OFF(%rsp)
- vmovdqa %ymm5, SPXMM5OFF(%rsp)
- vmovdqa %ymm6, SPXMM6OFF(%rsp)
- vmovdqa %ymm7, SPXMM7OFF(%rsp)
- jmp .trace_save_finish
-
-.trace_save_zmm:
- vmovdqa64 %zmm0, SPXMM0OFF(%rsp)
- vmovdqa64 %zmm1, SPXMM1OFF(%rsp)
- vmovdqa64 %zmm2, SPXMM2OFF(%rsp)
- vmovdqa64 %zmm3, SPXMM3OFF(%rsp)
- vmovdqa64 %zmm4, SPXMM4OFF(%rsp)
- vmovdqa64 %zmm5, SPXMM5OFF(%rsp)
- vmovdqa64 %zmm6, SPXMM6OFF(%rsp)
- vmovdqa64 %zmm7, SPXMM7OFF(%rsp)
-
-.trace_save_finish:
+ movq %rdi, ORIG_RDI(%rbp)
+ movq %rsi, ORIG_RSI(%rbp)
+ movq %rdx, ORIG_RDX(%rbp)
+ movq %rcx, ORIG_RCX(%rbp)
+ movq %r8, ORIG_R8(%rbp)
+ movq %r9, ORIG_R9(%rbp)
+ movq %r10, ORIG_R10(%rbp)
+ movq %r11, ORIG_R11(%rbp)
+ movq %rax, ORIG_RAX(%rbp)
+
+ subq $PLT_SAVE_OFF, %rsp
+
+ movq _plt_save_size@GOTPCREL(%rip),%r9
+ movq _plt_fp_save@GOTPCREL(%rip),%r10
+ subq (%r9), %rsp
+ andq $-64, %rsp
+ movq %rsp, %rdi
+ call *(%r10)
+ /*
+ * Now that we've saved all of our registers, figure out what we need to
+ * do next.
+ */
movq SPDYNOFF(%rbp), %rax / %rax = dyndata
testb $LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax) / <link.h>
je .start_pltenter
@@ -262,17 +396,17 @@ elf_plt_trace:
movq %rdi, 0(%rsi) / la_rsp
movq 0(%rbp), %rdi
movq %rdi, 8(%rsi) / la_rbp
- movq SPRDIOFF(%rsp), %rdi
+ movq ORIG_RDI(%rbp), %rdi
movq %rdi, 16(%rsi) / la_rdi
- movq SPRSIOFF(%rsp), %rdi
+ movq ORIG_RSI(%rbp), %rdi
movq %rdi, 24(%rsi) / la_rsi
- movq SPRDXOFF(%rsp), %rdi
+ movq ORIG_RDX(%rbp), %rdi
movq %rdi, 32(%rsi) / la_rdx
- movq SPRCXOFF(%rsp), %rdi
+ movq ORIG_RCX(%rbp), %rdi
movq %rdi, 40(%rsi) / la_rcx
- movq SPR8OFF(%rsp), %rdi
+ movq ORIG_R8(%rbp), %rdi
movq %rdi, 48(%rsi) / la_r8
- movq SPR9OFF(%rsp), %rdi
+ movq ORIG_R9(%rbp), %rdi
movq %rdi, 56(%rsi) / la_r9
/*
@@ -316,60 +450,21 @@ elf_plt_trace:
movq SPDESTOFF(%rbp), %r11 / r11 == calling destination
movq %r11, 0(%rbp) / store destination at top
- /
- / Restore registers
- /
- movq org_scapset@GOTPCREL(%rip),%r9
- movq (%r9),%r9 /* Syscapset_t pointer */
- movl 8(%r9),%edx /* sc_hw_2 */
- testl $AV_386_2_AVX512F,%edx
- jne .trace_restore_zmm
- movl (%r9),%edx /* sc_hw_1 */
- testl $AV_386_AVX,%edx
- jne .trace_restore_ymm
-
-.trace_restore_xmm:
- movdqa SPXMM0OFF(%rsp), %xmm0
- movdqa SPXMM1OFF(%rsp), %xmm1
- movdqa SPXMM2OFF(%rsp), %xmm2
- movdqa SPXMM3OFF(%rsp), %xmm3
- movdqa SPXMM4OFF(%rsp), %xmm4
- movdqa SPXMM5OFF(%rsp), %xmm5
- movdqa SPXMM6OFF(%rsp), %xmm6
- movdqa SPXMM7OFF(%rsp), %xmm7
- jmp .trace_restore_finish
-
-.trace_restore_ymm:
- vmovdqa SPXMM0OFF(%rsp), %ymm0
- vmovdqa SPXMM1OFF(%rsp), %ymm1
- vmovdqa SPXMM2OFF(%rsp), %ymm2
- vmovdqa SPXMM3OFF(%rsp), %ymm3
- vmovdqa SPXMM4OFF(%rsp), %ymm4
- vmovdqa SPXMM5OFF(%rsp), %ymm5
- vmovdqa SPXMM6OFF(%rsp), %ymm6
- vmovdqa SPXMM7OFF(%rsp), %ymm7
- jmp .trace_restore_finish
-
-.trace_restore_zmm:
- vmovdqa64 SPXMM0OFF(%rsp), %zmm0
- vmovdqa64 SPXMM1OFF(%rsp), %zmm1
- vmovdqa64 SPXMM2OFF(%rsp), %zmm2
- vmovdqa64 SPXMM3OFF(%rsp), %zmm3
- vmovdqa64 SPXMM4OFF(%rsp), %zmm4
- vmovdqa64 SPXMM5OFF(%rsp), %zmm5
- vmovdqa64 SPXMM6OFF(%rsp), %zmm6
- vmovdqa64 SPXMM7OFF(%rsp), %zmm7
-
-.trace_restore_finish:
- movq SPRDIOFF(%rsp), %rdi
- movq SPRSIOFF(%rsp), %rsi
- movq SPRDXOFF(%rsp), %rdx
- movq SPRCXOFF(%rsp), %rcx
- movq SPR8OFF(%rsp), %r8
- movq SPR9OFF(%rsp), %r9
- movq SPR10OFF(%rsp), %r10
- movq SPR11OFF(%rsp), %r11
- movq SPRAXOFF(%rsp), %rax
+ /* Restore FPU */
+ movq _plt_fp_restore@GOTPCREL(%rip),%r10
+
+ movq %rsp, %rdi
+ call *(%r10)
+
+ movq ORIG_RDI(%rbp), %rdi
+ movq ORIG_RSI(%rbp), %rsi
+ movq ORIG_RDX(%rbp), %rdx
+ movq ORIG_RCX(%rbp), %rcx
+ movq ORIG_R8(%rbp), %r8
+ movq ORIG_R9(%rbp), %r9
+ movq ORIG_R10(%rbp), %r10
+ movq ORIG_R11(%rbp), %r11
+ movq ORIG_RAX(%rbp), %rax
subq $8, %rbp / adjust %rbp for 'ret'
movq %rbp, %rsp /
@@ -417,13 +512,10 @@ elf_plt_trace:
/*
* Grow the stack and duplicate the arguments of the
* original caller.
- *
- * We save %rsp in %r11 since we need to use the current rsp for
- * accessing the registers saved in our stack frame.
*/
.grow_stack:
movq %rsp, %r11
- subq %rdx, %rsp / grow the stack
+ subq %rdx, %rsp / grow the stack
movq %rdx, SPPRVSTKOFF(%rbp) / -88(%rbp) == prev frame sz
movq %rsp, %rcx / %rcx = dest
addq %rcx, %rdx / %rdx == tail of dest
@@ -445,59 +537,20 @@ elf_plt_trace:
/ Restore registers using %r11 which contains our old %rsp value
/ before growing the stack.
/
-
- / Yes, we have to do this dance again. Sorry.
- movq org_scapset@GOTPCREL(%rip),%r9
- movq (%r9),%r9 /* Syscapset_t pointer */
- movl 8(%r9),%edx /* sc_hw_2 */
- testl $AV_386_2_AVX512F,%edx
- jne .trace_r2_zmm
- movl (%r9),%edx /* sc_hw_1 */
- testl $AV_386_AVX,%edx
- jne .trace_r2_ymm
-
-.trace_r2_xmm:
- movdqa SPXMM0OFF(%r11), %xmm0
- movdqa SPXMM1OFF(%r11), %xmm1
- movdqa SPXMM2OFF(%r11), %xmm2
- movdqa SPXMM3OFF(%r11), %xmm3
- movdqa SPXMM4OFF(%r11), %xmm4
- movdqa SPXMM5OFF(%r11), %xmm5
- movdqa SPXMM6OFF(%r11), %xmm6
- movdqa SPXMM7OFF(%r11), %xmm7
- jmp .trace_r2_finish
-
-.trace_r2_ymm:
- vmovdqa SPXMM0OFF(%r11), %ymm0
- vmovdqa SPXMM1OFF(%r11), %ymm1
- vmovdqa SPXMM2OFF(%r11), %ymm2
- vmovdqa SPXMM3OFF(%r11), %ymm3
- vmovdqa SPXMM4OFF(%r11), %ymm4
- vmovdqa SPXMM5OFF(%r11), %ymm5
- vmovdqa SPXMM6OFF(%r11), %ymm6
- vmovdqa SPXMM7OFF(%r11), %ymm7
- jmp .trace_r2_finish
-
-.trace_r2_zmm:
- vmovdqa64 SPXMM0OFF(%r11), %zmm0
- vmovdqa64 SPXMM1OFF(%r11), %zmm1
- vmovdqa64 SPXMM2OFF(%r11), %zmm2
- vmovdqa64 SPXMM3OFF(%r11), %zmm3
- vmovdqa64 SPXMM4OFF(%r11), %zmm4
- vmovdqa64 SPXMM5OFF(%r11), %zmm5
- vmovdqa64 SPXMM6OFF(%r11), %zmm6
- vmovdqa64 SPXMM7OFF(%r11), %zmm7
+ movq _plt_fp_restore@GOTPCREL(%rip),%r10
+ movq %r11, %rdi
+ call *(%r10)
.trace_r2_finish:
- movq SPRDIOFF(%r11), %rdi
- movq SPRSIOFF(%r11), %rsi
- movq SPRDXOFF(%r11), %rdx
- movq SPRCXOFF(%r11), %rcx
- movq SPR8OFF(%r11), %r8
- movq SPR9OFF(%r11), %r9
- movq SPR10OFF(%r11), %r10
- movq SPRAXOFF(%r11), %rax
- movq SPR11OFF(%r11), %r11 / retore %r11 last
+ movq ORIG_RDI(%rbp), %rdi
+ movq ORIG_RSI(%rbp), %rsi
+ movq ORIG_RDX(%rbp), %rdx
+ movq ORIG_RCX(%rbp), %rcx
+ movq ORIG_R8(%rbp), %r8
+ movq ORIG_R9(%rbp), %r9
+ movq ORIG_R10(%rbp), %r10
+ movq ORIG_RAX(%rbp), %rax
+ movq ORIG_R11(%rbp), %r11
/*
* Call to destination function - we'll return here
@@ -517,72 +570,32 @@ elf_plt_trace:
movq REFLMP_OFF(%r11), %rsi / arg2 (rlmp)
movq %rax, %rdi / arg1 (returnval)
call audit_pltexit@PLT
-
+
/*
* Clean up after ourselves and return to the
- * original calling procedure.
+ * original calling procedure. Make sure to restore
+ * registers.
*/
- /
- / Restore registers
- /
-
- movq org_scapset@GOTPCREL(%rip),%r9
- movq (%r9),%r9 /* Syscapset_t pointer */
- movl 8(%r9),%edx /* sc_hw_2 */
- testl $AV_386_2_AVX512F,%edx
- jne .trace_r3_zmm
- movl (%r9),%edx /* sc_hw_1 */
- testl $AV_386_AVX,%edx
- jne .trace_r3_ymm
-
-.trace_r3_xmm:
- movdqa SPXMM0OFF(%rsp), %xmm0
- movdqa SPXMM1OFF(%rsp), %xmm1
- movdqa SPXMM2OFF(%rsp), %xmm2
- movdqa SPXMM3OFF(%rsp), %xmm3
- movdqa SPXMM4OFF(%rsp), %xmm4
- movdqa SPXMM5OFF(%rsp), %xmm5
- movdqa SPXMM6OFF(%rsp), %xmm6
- movdqa SPXMM7OFF(%rsp), %xmm7
- jmp .trace_r3_finish
-
-.trace_r3_ymm:
- vmovdqa SPXMM0OFF(%rsp), %ymm0
- vmovdqa SPXMM1OFF(%rsp), %ymm1
- vmovdqa SPXMM2OFF(%rsp), %ymm2
- vmovdqa SPXMM3OFF(%rsp), %ymm3
- vmovdqa SPXMM4OFF(%rsp), %ymm4
- vmovdqa SPXMM5OFF(%rsp), %ymm5
- vmovdqa SPXMM6OFF(%rsp), %ymm6
- vmovdqa SPXMM7OFF(%rsp), %ymm7
- jmp .trace_r3_finish
-
-.trace_r3_zmm:
- vmovdqa64 SPXMM0OFF(%rsp), %zmm0
- vmovdqa64 SPXMM1OFF(%rsp), %zmm1
- vmovdqa64 SPXMM2OFF(%rsp), %zmm2
- vmovdqa64 SPXMM3OFF(%rsp), %zmm3
- vmovdqa64 SPXMM4OFF(%rsp), %zmm4
- vmovdqa64 SPXMM5OFF(%rsp), %zmm5
- vmovdqa64 SPXMM6OFF(%rsp), %zmm6
- vmovdqa64 SPXMM7OFF(%rsp), %zmm7
-
-.trace_r3_finish:
- movq SPRDIOFF(%rsp), %rdi
- movq SPRSIOFF(%rsp), %rsi
- movq SPRDXOFF(%rsp), %rdx
- movq SPRCXOFF(%rsp), %rcx
- movq SPR8OFF(%rsp), %r8
- movq SPR9OFF(%rsp), %r9
- movq SPR10OFF(%rsp), %r10
- movq SPR11OFF(%rsp), %r11
- // rax already contains return value
+ movq _plt_fp_restore@GOTPCREL(%rip),%r10
+ movq %rsp, %rdi
+ movq %rax, SPPRVSTKOFF(%rbp)
+ call *(%r10)
+
+ movq ORIG_RDI(%rbp), %rdi
+ movq ORIG_RSI(%rbp), %rsi
+ movq ORIG_RDX(%rbp), %rdx
+ movq ORIG_RCX(%rbp), %rcx
+ movq ORIG_R8(%rbp), %r8
+ movq ORIG_R9(%rbp), %r9
+ movq ORIG_R10(%rbp), %r10
+ movq ORIG_R11(%rbp), %r11
+ movq SPPRVSTKOFF(%rbp), %rax
movq %rbp, %rsp /
popq %rbp /
ret / return to caller
- .size elf_plt_trace, .-elf_plt_trace
+ SET_SIZE(elf_plt_trace)
#endif
/*
@@ -640,227 +653,91 @@ elf_rtbndr(Rt_map * lmp, unsigned long reloc, caddr_t pc)
#define LBRPCOFF 24 /* arg3 - pc of caller */
/*
- * Possible arguments for the resolved function are in registers as per
- * the AMD64 ABI. We must save on the local stack all possible register
- * arguments before interposing functions to resolve the called function.
- * Possible arguments must be restored before invoking the resolved function.
- *
- * Before the AVX and AVX512 instruction set enhancements to AMD64, there were
- * no changes in the set of registers and their sizes across different
- * processors. With AVX, the xmm registers became the lower 128 bits of the ymm
- * registers. With AVX512, the ymm registers became the lower 256 bits of the
- * zmm registers. Because of this, we need to conditionally save either 256 bits
- * or 512 bits, instead of 128 bits. Regardless of whether we have zmm registers
- * or not, we're always going to push the stack space assuming that we do, to
- * simplify the code.
- *
- * Local stack space storage for elf_rtbndr is allocated as follows:
- *
- * Saved regs:
- * %rax 8
- * %rdi 8
- * %rsi 8
- * %rdx 8
- * %rcx 8
- * %r8 8
- * %r9 8
- * %r10 8
- * =======
- * Subtotal: 64 (64byte aligned)
+ * With the above in place, we must now proceed to preserve all temporary
+ * registers that are also used for passing arguments. Specifically this
+ * means:
*
- * Saved Media Regs (used to pass floating point args):
- * %zmm0 - %zmm7 64 * 8 512
- * =======
- * Total: 576 (64byte aligned)
- *
- * So - will subtract the following to create enough space
+ * %rax - Used for information about the number of vector arguments
+ * %rdi - arg0
+ * %rsi - arg1
+ * %rdx - arg2
+ * %rcx - arg3
+ * %r8 - arg4
+ * %r9 - arg5
+ * %r10 - static chain pointer
*
- * 0(%rsp) save %rax
- * 8(%rsp) save %rdi
- * 16(%rsp) save %rsi
- * 24(%rsp) save %rdx
- * 32(%rsp) save %rcx
- * 40(%rsp) save %r8
- * 48(%rsp) save %r9
- * 56(%rsp) save %r10
- * 64(%rsp) save %zmm0
- * 128(%rsp) save %zmm1
- * 192(%rsp) save %zmm2
- * 256(%rsp) save %zmm3
- * 320(%rsp) save %zmm4
- * 384(%rsp) save %zmm5
- * 448(%rsp) save %zmm6
- * 512(%rsp) save %zmm7
- *
- * Note: Some callers may use 8-byte stack alignment. We use %rsp offsets to
- * save/restore registers because %rbp may not be 64-byte aligned. We guarantee
- * %rsp is 64-byte aligned in the function preamble.
+ * While we don't have to preserve %r11, we do have to preserve the FPU
+ * registers. The FPU logic is delegated to a specific function that we'll call.
+ * However, it requires that its stack is 64-byte aligned. We defer the
+ * alignment to that point. This will also take care of the fact that a caller
+ * may not call us with a correctly aligned stack pointer per the amd64 ABI.
*/
-/*
- * As the registers may either be xmm, ymm or zmm, we've left the name as xmm,
- * but increased the offset between them to always cover the zmm case.
- */
-#define LS_SIZE $576 /* local stack space to save all possible arguments */
-#define LSRAXOFF 0 /* for SSE register count */
-#define LSRDIOFF 8 /* arg 0 ... */
-#define LSRSIOFF 16
-#define LSRDXOFF 24
-#define LSRCXOFF 32
-#define LSR8OFF 40
-#define LSR9OFF 48
-#define LSR10OFF 56 /* ... arg 5 */
-#define LSXMM0OFF 64 /* SSE arg 0 ... */
-#define LSXMM1OFF 128
-#define LSXMM2OFF 192
-#define LSXMM3OFF 256
-#define LSXMM4OFF 320
-#define LSXMM5OFF 384
-#define LSXMM6OFF 448
-#define LSXMM7OFF 512 /* ... SSE arg 7 */
- /*
- * The org_scapset is a global variable that is a part of rtld. It
- * contains the capabilities that the kernel has told us are supported
- * (auxv_hwcap). This is necessary for determining whether or not we
- * need to save and restore AVX registers or simple SSE registers. Note,
- * that the field we care about is currently at offset 0, if that
- * changes, this code will have to be updated.
- */
- .extern org_scapset
+ .extern _plt_save_size
+ .extern _plt_fp_save
+ .extern _plt_fp_restore
+
.weak _elf_rtbndr
_elf_rtbndr = elf_rtbndr
ENTRY(elf_rtbndr)
-
- pushq %rbp
+ pushq %rbp /* Establish stack frame */
movq %rsp, %rbp
/*
- * Some libraries may (incorrectly) use non-ABI compliant 8-byte stack
- * alignment. Enforce 64-byte stack alignment here.
- * The next andq instruction essentially does this:
- * if necessary, subtract N bytes from %rsp to align on 64 bytes.
+ * Save basic regs.
*/
- andq $-64, %rsp /* enforce 64-byte stack alignment */
-
- subq LS_SIZE, %rsp /* save all ABI defined argument registers */
-
- movq %rax, LSRAXOFF(%rsp) /* for SSE register count */
- movq %rdi, LSRDIOFF(%rsp) /* arg 0 .. */
- movq %rsi, LSRSIOFF(%rsp)
- movq %rdx, LSRDXOFF(%rsp)
- movq %rcx, LSRCXOFF(%rsp)
- movq %r8, LSR8OFF(%rsp)
- movq %r9, LSR9OFF(%rsp) /* .. arg 5 */
- movq %r10, LSR10OFF(%rsp) /* call chain reg */
+ pushq %rax
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r12
/*
- * Our xmm registers could secretly be ymm or zmm registers in disguise.
+ * Make room for the space we need for the FPU registers and call the
+ * save function. Preserve the incoming %rsp in %r12 first to make the
+ * restore easier.
*/
- movq org_scapset@GOTPCREL(%rip),%r9
- movq (%r9),%r9 /* Syscapset_t pointer */
- movl 8(%r9),%edx /* sc_hw_2 */
- testl $AV_386_2_AVX512F,%edx
- jne .save_zmm
- movl (%r9),%edx /* sc_hw_1 */
- testl $AV_386_AVX,%edx
- jne .save_ymm
-
-.save_xmm:
- movdqa %xmm0, LSXMM0OFF(%rsp) /* SSE arg 0 ... */
- movdqa %xmm1, LSXMM1OFF(%rsp)
- movdqa %xmm2, LSXMM2OFF(%rsp)
- movdqa %xmm3, LSXMM3OFF(%rsp)
- movdqa %xmm4, LSXMM4OFF(%rsp)
- movdqa %xmm5, LSXMM5OFF(%rsp)
- movdqa %xmm6, LSXMM6OFF(%rsp)
- movdqa %xmm7, LSXMM7OFF(%rsp) /* ... SSE arg 7 */
- jmp .save_finish
+ movq %rsp, %r12
+ movq _plt_save_size@GOTPCREL(%rip),%r9
+ movq _plt_fp_save@GOTPCREL(%rip),%r10
+ subq (%r9), %rsp
+ andq $-64, %rsp
-.save_ymm:
- vmovdqa %ymm0, LSXMM0OFF(%rsp) /* SSE arg 0 ... */
- vmovdqa %ymm1, LSXMM1OFF(%rsp)
- vmovdqa %ymm2, LSXMM2OFF(%rsp)
- vmovdqa %ymm3, LSXMM3OFF(%rsp)
- vmovdqa %ymm4, LSXMM4OFF(%rsp)
- vmovdqa %ymm5, LSXMM5OFF(%rsp)
- vmovdqa %ymm6, LSXMM6OFF(%rsp)
- vmovdqa %ymm7, LSXMM7OFF(%rsp) /* ... SSE arg 7 */
- jmp .save_finish
+ movq %rsp, %rdi
+ call *(%r10)
-.save_zmm:
- vmovdqa64 %zmm0, LSXMM0OFF(%rsp) /* SSE arg 0 ... */
- vmovdqa64 %zmm1, LSXMM1OFF(%rsp)
- vmovdqa64 %zmm2, LSXMM2OFF(%rsp)
- vmovdqa64 %zmm3, LSXMM3OFF(%rsp)
- vmovdqa64 %zmm4, LSXMM4OFF(%rsp)
- vmovdqa64 %zmm5, LSXMM5OFF(%rsp)
- vmovdqa64 %zmm6, LSXMM6OFF(%rsp)
- vmovdqa64 %zmm7, LSXMM7OFF(%rsp) /* ... SSE arg 7 */
-
-.save_finish:
+ /*
+ * Perform actual PLT logic. Note that the plt related arguments are
+ * located at an offset relative to %rbp.
+ */
movq LBPLMPOFF(%rbp), %rdi /* arg1 - *lmp */
movq LBPRELOCOFF(%rbp), %rsi /* arg2 - reloc index */
movq LBRPCOFF(%rbp), %rdx /* arg3 - pc of caller */
call elf_bndr@PLT /* call elf_rtbndr(lmp, relndx, pc) */
movq %rax, LBPRELOCOFF(%rbp) /* store final destination */
- /*
- * Restore possible arguments before invoking resolved function. We
- * check the xmm, ymm, vs. zmm regs first so we can use the others.
- */
- movq org_scapset@GOTPCREL(%rip),%r9
- movq (%r9),%r9 /* Syscapset_t pointer */
- movl 8(%r9),%edx /* sc_hw_2 */
- testl $AV_386_2_AVX512F,%edx
- jne .restore_zmm
- movl (%r9),%edx /* sc_hw_1 */
- testl $AV_386_AVX,%edx
- jne .restore_ymm
+ /* Restore FPU */
+ movq _plt_fp_restore@GOTPCREL(%rip),%r10
-.restore_xmm:
- movdqa LSXMM0OFF(%rsp), %xmm0
- movdqa LSXMM1OFF(%rsp), %xmm1
- movdqa LSXMM2OFF(%rsp), %xmm2
- movdqa LSXMM3OFF(%rsp), %xmm3
- movdqa LSXMM4OFF(%rsp), %xmm4
- movdqa LSXMM5OFF(%rsp), %xmm5
- movdqa LSXMM6OFF(%rsp), %xmm6
- movdqa LSXMM7OFF(%rsp), %xmm7
- jmp .restore_finish
+ movq %rsp, %rdi
+ call *(%r10)
-.restore_ymm:
- vmovdqa LSXMM0OFF(%rsp), %ymm0
- vmovdqa LSXMM1OFF(%rsp), %ymm1
- vmovdqa LSXMM2OFF(%rsp), %ymm2
- vmovdqa LSXMM3OFF(%rsp), %ymm3
- vmovdqa LSXMM4OFF(%rsp), %ymm4
- vmovdqa LSXMM5OFF(%rsp), %ymm5
- vmovdqa LSXMM6OFF(%rsp), %ymm6
- vmovdqa LSXMM7OFF(%rsp), %ymm7
- jmp .restore_finish
+ movq %r12, %rsp
+ popq %r12
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ popq %rax
-.restore_zmm:
- vmovdqa64 LSXMM0OFF(%rsp), %zmm0
- vmovdqa64 LSXMM1OFF(%rsp), %zmm1
- vmovdqa64 LSXMM2OFF(%rsp), %zmm2
- vmovdqa64 LSXMM3OFF(%rsp), %zmm3
- vmovdqa64 LSXMM4OFF(%rsp), %zmm4
- vmovdqa64 LSXMM5OFF(%rsp), %zmm5
- vmovdqa64 LSXMM6OFF(%rsp), %zmm6
- vmovdqa64 LSXMM7OFF(%rsp), %zmm7
-
-.restore_finish:
- movq LSRAXOFF(%rsp), %rax
- movq LSRDIOFF(%rsp), %rdi
- movq LSRSIOFF(%rsp), %rsi
- movq LSRDXOFF(%rsp), %rdx
- movq LSRCXOFF(%rsp), %rcx
- movq LSR8OFF(%rsp), %r8
- movq LSR9OFF(%rsp), %r9
- movq LSR10OFF(%rsp), %r10
-
- movq %rbp, %rsp
+ movq %rbp, %rsp /* Restore our stack frame */
popq %rbp
addq $8, %rsp /* pop 1st plt-pushed args */
@@ -869,5 +746,6 @@ elf_rtbndr(Rt_map * lmp, unsigned long reloc, caddr_t pc)
/* final destination */
ret /* invoke resolved function */
- .size elf_rtbndr, .-elf_rtbndr
+
+ SET_SIZE(elf_rtbndr)
#endif
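(One closing note on the stack handling shared by elf_plt_trace and elf_rtbndr: both subtract the kernel-provided save size from %rsp and then round down with "andq $-64, %rsp", so the area handed to the save routine is 64-byte aligned even if the caller entered with a misaligned stack. A small C sketch of that arithmetic, for illustration only:)

#include <stdint.h>
#include <stddef.h>

/*
 * Compute the 64-byte-aligned base of an FPU save area of save_size bytes
 * placed just below the current stack pointer, mirroring
 * "subq save_size, %rsp; andq $-64, %rsp".
 */
static uintptr_t
fpu_area_base(uintptr_t sp, size_t save_size)
{
	sp -= save_size;
	sp &= ~(uintptr_t)63;	/* andq $-64 masks off the low six bits */
	return (sp);
}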