/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include <sys/param.h> #include <sys/errno.h> #include <sys/asm_linkage.h> #include <sys/vtrace.h> #include <sys/machthread.h> #include <sys/clock.h> #include <sys/asi.h> #include <sys/fsr.h> #include <sys/privregs.h> #include <sys/fpras_impl.h> #include "assym.h" /* * Pseudo-code to aid in understanding the control flow of the * bcopy/copyin/copyout routines. * * On entry: * * ! Determine whether to use the FP register version * ! or the leaf routine version depending on size * ! of copy and flags. Set up error handling accordingly. * ! The transition point depends on whether the src and * ! dst addresses can be aligned to long word, word, * ! half word, or byte boundaries. * ! * ! WARNING: * ! For FP version, %l6 holds previous error handling and * ! a flag: TRAMP_FLAG (low bits) * ! for leaf routine version, %o4 holds those values. * ! So either %l6 or %o4 is reserved and not available for * ! any other use. * * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test * go to small_copy; ! to speed short copies * * if (src,dst long word alignable) { * if (hw_copy_limit_8 == 0) ! hw_copy disabled * go to small_copy; * if (length <= hw_copy_limit_8) * go to small_copy; * go to FPBLK_copy; * } * if (src,dst not alignable) { * if (hw_copy_limit_1 == 0) ! hw_copy disabled * go to small_copy; * if (length <= hw_copy_limit_1) * go to small_copy; * go to FPBLK_copy; * } * if (src,dst halfword alignable) { * if (hw_copy_limit_2 == 0) ! hw_copy disabled * go to small_copy; * if (length <= hw_copy_limit_2) * go to small_copy; * go to FPBLK_copy; * } * if (src,dst word alignable) { * if (hw_copy_limit_4 == 0) ! hw_copy disabled * go to small_copy; * if (length <= hw_copy_limit_4) * go to small_copy; * go to FPBLK_copy; * } * * small_copy: * Setup_leaf_rtn_error_handler; ! diffs for each entry point * * if (count <= 3) ! fast path for tiny copies * go to sm_left; ! special finish up code * else * if (count > CHKSIZE) ! medium sized copies * go to sm_med ! tuned by alignment * if(src&dst not both word aligned) { * sm_movebytes: * move byte by byte in 4-way unrolled loop * fall into sm_left; * sm_left: * move 0-3 bytes byte at a time as needed. * restore error handler and exit. * * } else { ! src&dst are word aligned * check for at least 8 bytes left, * move word at a time, unrolled by 2 * when fewer than 8 bytes left, * sm_half: move half word at a time while 2 or more bytes left * sm_byte: move final byte if necessary * sm_exit: * restore error handler and exit. * } * * ! Medium length cases with at least CHKSIZE bytes available * ! method: line up src and dst as best possible, then * ! move data in 4-way unrolled loops.
* * sm_med: * if(src&dst unalignable) * go to sm_movebytes * if(src&dst halfword alignable) * go to sm_movehalf * if(src&dst word alignable) * go to sm_moveword * ! fall into long word movement * move bytes until src is word aligned * if not long word aligned, move a word * move long words in 4-way unrolled loop until < 32 bytes left * move long words in 1-way unrolled loop until < 8 bytes left * if zero bytes left, goto sm_exit * if one byte left, go to sm_byte * else go to sm_half * * sm_moveword: * move bytes until src is word aligned * move words in 4-way unrolled loop until < 16 bytes left * move words in 1-way unrolled loop until < 4 bytes left * if zero bytes left, goto sm_exit * if one byte left, go to sm_byte * else go to sm_half * * sm_movehalf: * move a byte if needed to align src on halfword * move halfwords in 4-way unrolled loop until < 8 bytes left * if zero bytes left, goto sm_exit * if one byte left, go to sm_byte * else go to sm_half * * * FPBLK_copy: * %l6 = curthread->t_lofault; * if (%l6 != NULL) { * membar #Sync * curthread->t_lofault = .copyerr; * caller_error_handler = TRUE ! %l6 |= 2 * } * * ! for FPU testing we must not migrate cpus * if (curthread->t_lwp == NULL) { * ! Kernel threads do not have pcb's in which to store * ! the floating point state, so disallow preemption during * ! the copy. This also prevents cpu migration. * kpreempt_disable(curthread); * } else { * thread_nomigrate(); * } * * old_fprs = %fprs; * old_gsr = %gsr; * if (%fprs.fef) { * %fprs.fef = 1; * save current fpregs on stack using blockstore * } else { * %fprs.fef = 1; * } * * * do_blockcopy_here; * * In lofault handler: * curthread->t_lofault = .copyerr2; * Continue on with the normal exit handler * * On normal exit: * %gsr = old_gsr; * if (old_fprs & FPRS_FEF) * restore fpregs from stack using blockload * else * zero fpregs * %fprs = old_fprs; * membar #Sync * curthread->t_lofault = (%l6 & ~3); * ! following test omitted from copyin/copyout as they * ! will always have a current thread * if (curthread->t_lwp == NULL) * kpreempt_enable(curthread); * else * thread_allowmigrate(); * return (0) * * In second lofault handler (.copyerr2): * We've tried to restore fp state from the stack and failed. To * prevent returning with a corrupted fp state, we will panic. */ /* * Comments about optimization choices * * The initial optimization decision in this code is to determine * whether to use the FP registers for a copy or not. If we don't * use the FP registers, we can execute the copy as a leaf routine, * saving a register save and restore. Also, less elaborate setup * is required, allowing short copies to be completed more quickly. * For longer copies, especially unaligned ones (where the src and * dst do not align to allow simple ldx,stx operation), the FP * registers allow much faster copy operations. * * The estimated extra cost of the FP path will vary depending on * src/dst alignment, dst offset from the next 64 byte FPblock store * boundary, remaining src data after the last full dst cache line is * moved, whether the FP registers need to be saved, and some other * minor issues. The average additional overhead is estimated to be * 400 clocks. Since each non-repeated/predicted tst and branch costs * around 10 clocks, elaborate calculation would slow down all * longer copies and only benefit a small portion of medium sized * copies. Rather than incur such cost, we chose fixed transition * points for each of the alignment choices.
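 *
 * As a rough illustration only (not the shipped code, which is the
 * hand-scheduled assembly below), the fixed-transition-point decision
 * amounts to the following C sketch. hw_copy_limit_1/2/4/8 and
 * VIS_COPY_THRESHOLD are the real tunables used in this file; the
 * function name and its parameters are invented for the example.
 *
 *	extern uint_t hw_copy_limit_1, hw_copy_limit_2,
 *	    hw_copy_limit_4, hw_copy_limit_8;
 *
 *	// Returns nonzero if the FP/VIS block-copy path should be used.
 *	static int
 *	use_fpblk_copy(uintptr_t src, uintptr_t dst, size_t len)
 *	{
 *		uintptr_t x = src ^ dst;	// how well can src/dst co-align?
 *		uint_t limit;
 *
 *		if (len <= VIS_COPY_THRESHOLD)	// quick test for short copies
 *			return (0);
 *		if ((x & 0x7) == 0)		// long word alignable
 *			limit = hw_copy_limit_8;
 *		else if (x & 0x1)		// byte alignable only
 *			limit = hw_copy_limit_1;
 *		else if (x & 0x3)		// halfword alignable
 *			limit = hw_copy_limit_2;
 *		else				// word alignable
 *			limit = hw_copy_limit_4;
 *		return (limit != 0 && len > limit);	// 0 disables FPBLK
 *	}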
* * For the inner loop, here is a comparison of the per cache line * costs for each alignment when src&dst are in cache: * * byte aligned: 108 clocks slower for non-FPBLK * half aligned: 44 clocks slower for non-FPBLK * word aligned: 12 clocks slower for non-FPBLK * long aligned: 4 clocks >>faster<< for non-FPBLK * * The long aligned loop runs faster because it does no prefetching. * That wins if the data is already in cache or there is too little * data to gain much benefit from prefetching. But when there * is more data and that data is not in cache, failing to prefetch * can run much slower. In addition, there is a 2 Kbyte store queue * which will cause the non-FPBLK inner loop to slow for larger copies. * The exact tradeoff is strongly load and application dependent, with * increasing risk of a customer visible performance regression if the * non-FPBLK code is used for larger copies. Studies of synthetic in-cache * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe * upper limit for the non-FPBLK code. To minimize performance regression * risk while still gaining the primary benefits of the improvements to * the non-FPBLK code, we set an upper bound of 1024 bytes for the various * hw_copy_limit_*. Later experimental studies using different values * of hw_copy_limit_* can be used to make further adjustments if * appropriate. * * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned * hw_copy_limit_4 = src and dst are word aligned but not longword aligned * hw_copy_limit_8 = src and dst are longword aligned * * To say that src and dst are word aligned means that after * some initial alignment activity of moving 0 to 3 bytes, * both the src and dst will be on word boundaries so that * word loads and stores may be used. * * Recommended initial values as of Mar 2004, based on testing * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz): * hw_copy_limit_1 = 256 * hw_copy_limit_2 = 512 * hw_copy_limit_4 = 1024 * hw_copy_limit_8 = 1024 (or 1536 on some systems) * * * If hw_copy_limit_? is set to zero, then use of FPBLK copy is * disabled for that alignment choice. * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256), * the value of VIS_COPY_THRESHOLD is used. * It is not envisioned that hw_copy_limit_? will be changed in the field. * It is provided to allow for disabling FPBLK copies and to allow * easy testing of alternate values on future HW implementations * that might have different cache sizes, clock rates or instruction * timing rules. * * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum * threshold to speed up all shorter copies (less than 256). That * saves an alignment test, memory reference, and enabling test * for all short copies, or an estimated 24 clocks. * * The order in which these limits are checked does matter since each * non-predicted tst and branch costs around 10 clocks. * If src and dst are randomly selected addresses, * 4 of 8 will not be alignable. * 2 of 8 will be half word alignable. * 1 of 8 will be word alignable. * 1 of 8 will be long word alignable. * But, tests on running kernels show that src and dst to copy code * are typically not on random alignments. Structure copies and * copies of larger data sizes are often on long word boundaries. * So we test the long word alignment case first, then * the byte alignment, then halfword, then word alignment.
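 *
 * A short worked example of "word alignable" in the sense just described
 * (the addresses are made up for illustration): with src = 0x300006 and
 * dst = 0x40000a, src ^ dst = 0x70000c, so the low two bits agree but
 * bit 2 does not. After the initial move of 2 bytes both pointers land
 * on word boundaries (0x300008 and 0x40000c), word loads and stores can
 * be used, and the copy is governed by hw_copy_limit_4:
 *
 *	// the classification depends only on the XOR of the two addresses
 *	assert(((0x300006 ^ 0x40000a) & 0x3) == 0);	// word co-alignable
 *	assert(((0x300006 ^ 0x40000a) & 0x7) != 0);	// but not long word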
* * Several times, tests for length are made to split the code * into subcases. These tests often allow later tests to be * avoided. For example, within the non-FPBLK copy, we first * check for tiny copies of 3 bytes or less. That allows us * to use a 4-way unrolled loop for the general byte copy case * without a test on loop entry. * We subdivide the non-FPBLK case further into CHKSIZE bytes and less * vs longer cases. For the really short case, we don't attempt to * align src and dst. We try to minimize special case tests in * the shortest loops as each test adds a significant percentage * to the total time. * * For the medium sized cases, we allow ourselves to adjust the * src and dst alignment and provide special cases for each of * the four adjusted alignment cases. The CHKSIZE that was used * to decide between short and medium size was chosen to be 39 * as that allows for the worst case of 7 bytes of alignment * shift and 4 times 8 bytes for the first long word unrolling. * That knowledge saves an initial test for length on entry into * the medium cases. If the general loop unrolling factor were * to be increased, this number would also need to be adjusted. * * For all cases in the non-FPBLK code where it is known that at * least 4 chunks of data are available for movement, the * loop is unrolled by four. This 4-way loop runs in 8 clocks * or 2 clocks per data element. Due to limitations of the * branch instruction on Cheetah, Jaguar, and Panther, the * minimum time for a small, tight loop is 3 clocks. So * the 4-way loop runs 50% faster than the fastest non-unrolled * loop. * * Instruction alignment is forced by use of .align 16 directives * and nops which are not executed in the code. This * combination of operations shifts the alignment of following * loops to insure that loops are aligned so that their instructions * fall within the minimum number of 4 instruction fetch groups. * If instructions are inserted or removed between the .align * instruction and the unrolled loops, then the alignment needs * to be readjusted. Misaligned loops can add a clock per loop * iteration to the loop timing. * * In a few cases, code is duplicated to avoid a branch. Since * a non-predicted tst and branch takes 10 clocks, this savings * is judged an appropriate time-space tradeoff. * * Within the FPBLK-code, the prefetch method in the inner * loop needs to be explained as it is not standard. Two * prefetches are issued for each cache line instead of one. * The primary one is at the maximum reach of 8 cache lines. * Most of the time, that maximum prefetch reach gives the * cache line more time to reach the processor for systems with * higher processor clocks. But, sometimes memory interference * can cause that prefetch to be dropped. Putting a second * prefetch at a reach of 5 cache lines catches the drops * three iterations later and shows a measured improvement * in performance over any similar loop with a single prefetch. * The prefetches are placed in the loop so they overlap with * non-memory instructions, so that there is no extra cost * when the data is already in-cache. * */ /* * Notes on preserving existing fp state and on membars. * * When a copyOP decides to use fp we may have to preserve existing * floating point state. It is not the caller's state that we need to * preserve - the rest of the kernel does not use fp and, anyway, fp * registers are volatile across a call.
Some examples: * * - userland has fp state and is interrupted (device interrupt * or trap) and within the interrupt/trap handling we use * bcopy() * - another (higher level) interrupt or trap handler uses bcopy * while a bcopy from an earlier interrupt is still active * - an asynchronous error trap occurs while fp state exists (in * userland or in kernel copy) and the tl0 component of the handling * uses bcopy * - a user process with fp state incurs a copy-on-write fault and * hwblkpagecopy always uses fp * * We therefore need a per-call place in which to preserve fp state - * using our stack is ideal (and since fp copy cannot be leaf optimized * because of calls it makes, this is no hardship). * * The following membar BLD/BST discussion is Cheetah pipeline specific. * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are * nops (those semantics always apply) and #StoreLoad is implemented * as a membar #Sync. * * It is possible that the owner of the fp state has a block load or * block store still "in flight" at the time we come to preserve that * state. Block loads are blocking in Cheetah pipelines so we do not * need to sync with them. In preserving fp regs we will use block stores * (which are not blocking in Cheetah pipelines) so we require a membar #Sync * after storing state (so that our subsequent use of those registers * does not modify them before the block stores complete); this membar * also serves to sync with block stores the owner of the fp state has * initiated. * * When we have finished fp copy (with it's repeated block stores) * we must membar #Sync so that our block stores may complete before * we either restore the original fp state into the fp registers or * return to a caller which may initiate other fp operations that could * modify the fp regs we used before the block stores complete. * * Synchronous faults (eg, unresolvable DMMU miss) that occur while * t_lofault is not NULL will not panic but will instead trampoline * to the registered lofault handler. There is no need for any * membars for these - eg, our store to t_lofault will always be visible to * ourselves and it is our cpu which will take any trap. * * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur * while t_lofault is not NULL will also not panic. Since we're copying * to or from userland the extent of the damage is known - the destination * buffer is incomplete. So trap handlers will trampoline to the lofault * handler in this case which should take some form of error action to * avoid using the incomplete buffer. The trap handler also flags the * fault so that later return-from-trap handling (for the trap that brought * this thread into the kernel in the first place) can notify the process * and reboot the system (or restart the service with Greenline/Contracts). * * Asynchronous faults (eg, uncorrectable ECC error from memory) can * result in deferred error traps - the trap is taken sometime after * the event and the trap PC may not be the PC of the faulting access. * Delivery of such pending traps can be forced by a membar #Sync, acting * as an "error barrier" in this role. To accurately apply the user/kernel * separation described in the preceding paragraph we must force delivery * of deferred traps affecting kernel state before we install a lofault * handler (if we interpose a new lofault handler on an existing one there * is no need to repeat this), and we must force delivery of deferred * errors affecting the lofault-protected region before we clear t_lofault. 
* Failure to do so results in lost kernel state being interpreted as * affecting a copyin/copyout only, or in an error that really only * affects copy data being interpreted as losing kernel state. * * Since the copy operations may preserve and later restore floating * point state that does not belong to the caller (see examples above), * we must be careful in how we do this in order to prevent corruption * of another program. * * To make sure that floating point state is always saved and restored * correctly, the following "big rules" must be followed when the floating * point registers will be used: * * 1. %l6 always holds the caller's lofault handler. Also in this register, * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a * lofault handler was set coming in. * * 2. The FPUSED flag indicates that all FP state has been successfully stored * on the stack. It should not be set until this save has been completed. * * 3. The FPUSED flag should not be cleared on exit until all FP state has * been restored from the stack. If an error occurs while restoring * data from the stack, the error handler can check this flag to see if * a restore is necessary. * * 4. Code run under the new lofault handler must be kept to a minimum. In * particular, any calls to FP_ALLOWMIGRATE, which could result in a call * to kpreempt(), should not be made until after the lofault handler has * been restored. */ /* * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed * to "break even" using FP/VIS-accelerated memory operations. * The FPBLK code assumes a minimum number of bytes are available * to be moved on entry. Check that code carefully before * reducing VIS_COPY_THRESHOLD below 256. */ /* * This shadows sys/machsystm.h which can't be included due to the lack of * _ASM guards in include files it references. Change it here, change it there. */ #define VIS_COPY_THRESHOLD 256 /* * TEST for very short copies * Be aware that the maximum unroll for the short unaligned case * is SHORTCOPY+1 */ #define SHORTCOPY 3 #define CHKSIZE 39 /* * Indicates that we're to trampoline to the error handler. * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag. * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag. */ #define FPUSED_FLAG 1 #define TRAMP_FLAG 2 #define MASK_FLAGS 3 /* * Number of outstanding prefetches. * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a * reach of 5*BLOCK_SIZE. The double prefetch gives a typical improvement * of 5% for large copies as compared to a single prefetch. The reason * for the improvement is that with Cheetah and Jaguar, some prefetches * are dropped due to the prefetch queue being full. The second prefetch * reduces the number of cache lines that are dropped. * Do not remove the double prefetch or change either CHEETAH_PREFETCH * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove * there is no loss of performance. */ #define CHEETAH_PREFETCH 8 #define CHEETAH_2ND_PREFETCH 5 #define VIS_BLOCKSIZE 64 /* * Size of stack frame in order to accommodate a 64-byte aligned * floating-point register save area and 2 64-bit temp locations. * All copy functions use two quadrants of fp registers; to assure a * block-aligned two block buffer in which to save we must reserve * three blocks on stack.
Not all functions preserve %fprs on stack * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all. * * _______________________________________ <-- %fp + STACK_BIAS * | We may need to preserve 2 quadrants | * | of fp regs, but since we do so with | * | BST/BLD we need room in which to | * | align to VIS_BLOCKSIZE bytes. So | * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET * |-------------------------------------| * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET * |-------------------------------------| * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET * --------------------------------------- */ #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8)) #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3) #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1) #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8) #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8) /* * Common macros used by the various versions of the block copy * routines in this file. */ /* * In FP copies if we do not have preserved data to restore over * the fp regs we used then we must zero those regs to avoid * exposing portions of the data to later threads (data security). * * Copy functions use either quadrants 1 and 3 or 2 and 4. * * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63 * * The instructions below are quicker than repeated fzero instructions * since they can dispatch down two fp pipelines. */ #define FZEROQ1Q3 \ fzero %f0 ;\ fzero %f2 ;\ faddd %f0, %f2, %f4 ;\ fmuld %f0, %f2, %f6 ;\ faddd %f0, %f2, %f8 ;\ fmuld %f0, %f2, %f10 ;\ faddd %f0, %f2, %f12 ;\ fmuld %f0, %f2, %f14 ;\ faddd %f0, %f2, %f32 ;\ fmuld %f0, %f2, %f34 ;\ faddd %f0, %f2, %f36 ;\ fmuld %f0, %f2, %f38 ;\ faddd %f0, %f2, %f40 ;\ fmuld %f0, %f2, %f42 ;\ faddd %f0, %f2, %f44 ;\ fmuld %f0, %f2, %f46 #define FZEROQ2Q4 \ fzero %f16 ;\ fzero %f18 ;\ faddd %f16, %f18, %f20 ;\ fmuld %f16, %f18, %f22 ;\ faddd %f16, %f18, %f24 ;\ fmuld %f16, %f18, %f26 ;\ faddd %f16, %f18, %f28 ;\ fmuld %f16, %f18, %f30 ;\ faddd %f16, %f18, %f48 ;\ fmuld %f16, %f18, %f50 ;\ faddd %f16, %f18, %f52 ;\ fmuld %f16, %f18, %f54 ;\ faddd %f16, %f18, %f56 ;\ fmuld %f16, %f18, %f58 ;\ faddd %f16, %f18, %f60 ;\ fmuld %f16, %f18, %f62 /* * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack. * Used to save and restore in-use fp registers when we want to use FP * and find fp already in use and copy size still large enough to justify * the additional overhead of this save and restore. * * A membar #Sync is needed before save to sync fp ops initiated before * the call to the copy function (by whoever has fp in use); for example * an earlier block load to the quadrant we are about to save may still be * "in flight". A membar #Sync is required at the end of the save to * sync our block store (the copy code is about to begin ldd's to the * first quadrant). Note, however, that since Cheetah pipeline block load * is blocking we can omit the initial membar before saving fp state (they're * commented below in case of future porting to a chip that does not block * on block load). * * Similarly: a membar #Sync before restore allows the block stores of * the copy operation to complete before we fill the quadrants with their * original data, and a membar #Sync after restore lets the block loads * of the restore complete before we return to whoever has the fp regs * in use.
To avoid repeated membar #Sync we make it the responsibility * of the copy code to membar #Sync immediately after copy is complete * and before using the BLD_*_FROMSTACK macro. */ #define BST_FPQ1Q3_TOSTACK(tmp1) \ /* membar #Sync */ ;\ add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ stda %f0, [tmp1]ASI_BLK_P ;\ add tmp1, VIS_BLOCKSIZE, tmp1 ;\ stda %f32, [tmp1]ASI_BLK_P ;\ membar #Sync #define BLD_FPQ1Q3_FROMSTACK(tmp1) \ /* membar #Sync - provided at copy completion */ ;\ add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ ldda [tmp1]ASI_BLK_P, %f0 ;\ add tmp1, VIS_BLOCKSIZE, tmp1 ;\ ldda [tmp1]ASI_BLK_P, %f32 ;\ membar #Sync #define BST_FPQ2Q4_TOSTACK(tmp1) \ /* membar #Sync */ ;\ add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ stda %f16, [tmp1]ASI_BLK_P ;\ add tmp1, VIS_BLOCKSIZE, tmp1 ;\ stda %f48, [tmp1]ASI_BLK_P ;\ membar #Sync #define BLD_FPQ2Q4_FROMSTACK(tmp1) \ /* membar #Sync - provided at copy completion */ ;\ add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ ldda [tmp1]ASI_BLK_P, %f16 ;\ add tmp1, VIS_BLOCKSIZE, tmp1 ;\ ldda [tmp1]ASI_BLK_P, %f48 ;\ membar #Sync /* * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger, * prevent preemption if there is no t_lwp to save FP state to on context * switch) before commencing a FP copy, and reallow it on completion or * in error trampoline paths when we were using FP copy. * * Both macros may call other functions, so be aware that all outputs are * forfeit after using these macros. For this reason we do not pass registers * to use - we just use any outputs we want. * * For fpRAS we need to perform the fpRAS mechanism test on the same * CPU as we use for the copy operation, both so that we validate the * CPU we perform the copy on and so that we know which CPU failed * if a failure is detected. Hence we need to be bound to "our" CPU. * This could be achieved through disabling preemption (and we have do it that * way for threads with no t_lwp) but for larger copies this may hold * higher priority threads off of cpu for too long (eg, realtime). So we * make use of the lightweight t_nomigrate mechanism where we can (ie, when * we have a t_lwp). * * Pseudo code: * * FP_NOMIGRATE: * * if (curthread->t_lwp) { * thread_nomigrate(); * } else { * kpreempt_disable(); * } * * FP_ALLOWMIGRATE: * * if (curthread->t_lwp) { * thread_allowmigrate(); * } else { * kpreempt_enable(); * } */ #define FP_NOMIGRATE(label1, label2) \ ldn [THREAD_REG + T_LWP], %o0 ;\ brz,a,pn %o0, label1/**/f ;\ ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ call thread_nomigrate ;\ nop ;\ ba label2/**/f ;\ nop ;\ label1: ;\ inc %o1 ;\ stb %o1, [THREAD_REG + T_PREEMPT] ;\ label2: #define FP_ALLOWMIGRATE(label1, label2) \ ldn [THREAD_REG + T_LWP], %o0 ;\ brz,a,pn %o0, label1/**/f ;\ ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ call thread_allowmigrate ;\ nop ;\ ba label2/**/f ;\ nop ;\ label1: ;\ dec %o1 ;\ brnz,pn %o1, label2/**/f ;\ stb %o1, [THREAD_REG + T_PREEMPT] ;\ ldn [THREAD_REG + T_CPU], %o0 ;\ ldub [%o0 + CPU_KPRUNRUN], %o0 ;\ brz,pt %o0, label2/**/f ;\ nop ;\ call kpreempt ;\ rdpr %pil, %o0 ;\ label2: /* * Copy a block of storage, returning an error code if `from' or * `to' takes a kernel pagefault which cannot be resolved. * Returns errno value on pagefault error, 0 if all ok */ .seg ".text" .align 4 ENTRY(kcopy) cmp %o2, VIS_COPY_THRESHOLD ! 
check for leaf rtn case bleu,pt %ncc, .kcopy_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .kcopy_8 ! check for longword alignment nop btst 1, %o3 ! bz,pt %ncc, .kcopy_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .kcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .kcopy_small ! go to small copy nop ba,pt %ncc, .kcopy_more ! otherwise go to large copy nop .kcopy_2: btst 3, %o3 ! bz,pt %ncc, .kcopy_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .kcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .kcopy_small ! go to small copy nop ba,pt %ncc, .kcopy_more ! otherwise go to large copy nop .kcopy_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .kcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .kcopy_small ! go to small copy nop ba,pt %ncc, .kcopy_more ! otherwise go to large copy nop .kcopy_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .kcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .kcopy_small ! go to small copy nop ba,pt %ncc, .kcopy_more ! otherwise go to large copy nop .kcopy_small: sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value or %o5, %lo(.sm_copyerr), %o5 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler membar #Sync ! sync error barrier ba,pt %ncc, .sm_do_copy ! common code stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault .kcopy_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp sethi %hi(.copyerr), %l7 ! copyerr is lofault value or %l7, %lo(.copyerr), %l7 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler membar #Sync ! sync error barrier ba,pt %ncc, .do_copy ! common code stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault /* * We got here because of a fault during bcopy_more, called from kcopy or bcopy. * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3. */ .copyerr: set .copyerr2, %l0 membar #Sync ! sync error barrier stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault btst FPUSED_FLAG, %l6 bz %ncc, 1f and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr wr %o2, 0, %gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz,pt %icc, 4f nop BLD_FPQ1Q3_FROMSTACK(%o2) ba,pt %ncc, 1f wr %o3, 0, %fprs ! restore fprs 4: FZEROQ1Q3 wr %o3, 0, %fprs ! restore fprs ! ! Need to cater for the different expectations of kcopy ! and bcopy. kcopy will *always* set a t_lofault handler ! If it fires, we're expected to just return the error code ! and *not* to invoke any existing error handler. As far as ! bcopy is concerned, we only set t_lofault if there was an ! existing lofault handler. In that case we're expected to ! invoke the previously existing handler after resetting the ! t_lofault value. ! 1: andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off membar #Sync ! sync error barrier stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault FP_ALLOWMIGRATE(5, 6) btst TRAMP_FLAG, %l0 bnz,pn %ncc, 3f nop ret restore %g1, 0, %o0 3: ! ! We're here via bcopy. There *must* have been an error handler ! 
in place otherwise we would have died a nasty death already. ! jmp %l6 ! goto real handler restore %g0, 0, %o0 ! dispose of copy window /* * We got here because of a fault in .copyerr. We can't safely restore fp * state, so we panic. */ fp_panic_msg: .asciz "Unable to restore fp state after copy operation" .align 4 .copyerr2: set fp_panic_msg, %o0 call panic nop /* * We got here because of a fault during a small kcopy or bcopy. * No floating point registers are used by the small copies. * Errno value is in %g1. */ .sm_copyerr: 1: btst TRAMP_FLAG, %o4 membar #Sync andn %o4, TRAMP_FLAG, %o4 bnz,pn %ncc, 3f stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g1, %o0 3: jmp %o4 ! goto real handler mov %g0, %o0 ! SET_SIZE(kcopy) /* * Copy a block of storage - must not overlap (from + len <= to). * Registers: l6 - saved t_lofault * (for short copies, o4 - saved t_lofault) * * Copy a page of memory. * Assumes double word alignment and a count >= 256. */ ENTRY(bcopy) cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case bleu,pt %ncc, .bcopy_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .bcopy_8 ! check for longword alignment nop btst 1, %o3 ! bz,pt %ncc, .bcopy_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .bcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .bcopy_small ! go to small copy nop ba,pt %ncc, .bcopy_more ! otherwise go to large copy nop .bcopy_2: btst 3, %o3 ! bz,pt %ncc, .bcopy_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .bcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .bcopy_small ! go to small copy nop ba,pt %ncc, .bcopy_more ! otherwise go to large copy nop .bcopy_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .bcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .bcopy_small ! go to small copy nop ba,pt %ncc, .bcopy_more ! otherwise go to large copy nop .bcopy_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .bcopy_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .bcopy_small ! go to small copy nop ba,pt %ncc, .bcopy_more ! otherwise go to large copy nop .align 16 .bcopy_small: ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault tst %o4 bz,pt %icc, .sm_do_copy nop sethi %hi(.sm_copyerr), %o5 or %o5, %lo(.sm_copyerr), %o5 membar #Sync ! sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector or %o4, TRAMP_FLAG, %o4 ! error should trampoline .sm_do_copy: cmp %o2, SHORTCOPY ! check for really short case bleu,pt %ncc, .bc_sm_left ! cmp %o2, CHKSIZE ! check for medium length cases bgu,pn %ncc, .bc_med ! or %o0, %o1, %o3 ! prepare alignment check andcc %o3, 0x3, %g0 ! test for alignment bz,pt %ncc, .bc_sm_word ! branch to word aligned case .bc_sm_movebytes: sub %o2, 3, %o2 ! adjust count to allow cc zero test .bc_sm_notalign4: ldub [%o0], %o3 ! read byte stb %o3, [%o1] ! write byte subcc %o2, 4, %o2 ! reduce count by 4 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes add %o0, 4, %o0 ! advance SRC by 4 stb %o3, [%o1 + 1] ldub [%o0 - 2], %o3 add %o1, 4, %o1 ! 
advance DST by 4 stb %o3, [%o1 - 2] ldub [%o0 - 1], %o3 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain stb %o3, [%o1 - 1] add %o2, 3, %o2 ! restore count .bc_sm_left: tst %o2 bz,pt %ncc, .bc_sm_exit ! check for zero length deccc %o2 ! reduce count for cc test ldub [%o0], %o3 ! move one byte bz,pt %ncc, .bc_sm_exit stb %o3, [%o1] ldub [%o0 + 1], %o3 ! move another byte deccc %o2 ! check for more bz,pt %ncc, .bc_sm_exit stb %o3, [%o1 + 1] ldub [%o0 + 2], %o3 ! move final byte stb %o3, [%o1 + 2] membar #Sync ! sync error barrier andn %o4, TRAMP_FLAG, %o4 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return 0 .align 16 nop ! instruction alignment ! see discussion at start of file .bc_sm_words: lduw [%o0], %o3 ! read word .bc_sm_wordx: subcc %o2, 8, %o2 ! update count stw %o3, [%o1] ! write word add %o0, 8, %o0 ! update SRC lduw [%o0 - 4], %o3 ! read word add %o1, 8, %o1 ! update DST bgt,pt %ncc, .bc_sm_words ! loop til done stw %o3, [%o1 - 4] ! write word addcc %o2, 7, %o2 ! restore count bz,pt %ncc, .bc_sm_exit deccc %o2 bz,pt %ncc, .bc_sm_byte .bc_sm_half: subcc %o2, 2, %o2 ! reduce count by 2 add %o0, 2, %o0 ! advance SRC by 2 lduh [%o0 - 2], %o3 ! read half word add %o1, 2, %o1 ! advance DST by 2 bgt,pt %ncc, .bc_sm_half ! loop til done sth %o3, [%o1 - 2] ! write half word addcc %o2, 1, %o2 ! restore count bz,pt %ncc, .bc_sm_exit nop .bc_sm_byte: ldub [%o0], %o3 stb %o3, [%o1] membar #Sync ! sync error barrier andn %o4, TRAMP_FLAG, %o4 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return 0 .bc_sm_word: subcc %o2, 4, %o2 ! update count bgt,pt %ncc, .bc_sm_wordx lduw [%o0], %o3 ! read word addcc %o2, 3, %o2 ! restore count bz,pt %ncc, .bc_sm_exit stw %o3, [%o1] ! write word deccc %o2 ! reduce count for cc test ldub [%o0 + 4], %o3 ! load one byte bz,pt %ncc, .bc_sm_exit stb %o3, [%o1 + 4] ! store one byte ldub [%o0 + 5], %o3 ! load second byte deccc %o2 bz,pt %ncc, .bc_sm_exit stb %o3, [%o1 + 5] ! store second byte ldub [%o0 + 6], %o3 ! load third byte stb %o3, [%o1 + 6] ! store third byte .bc_sm_exit: membar #Sync ! sync error barrier andn %o4, TRAMP_FLAG, %o4 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return 0 .align 16 .bc_med: xor %o0, %o1, %o3 ! setup alignment check btst 1, %o3 bnz,pt %ncc, .bc_sm_movebytes ! unaligned nop btst 3, %o3 bnz,pt %ncc, .bc_med_half ! halfword aligned nop btst 7, %o3 bnz,pt %ncc, .bc_med_word ! word aligned nop .bc_med_long: btst 3, %o0 ! check for bz,pt %ncc, .bc_med_long1 ! word alignment nop .bc_med_long0: ldub [%o0], %o3 ! load one byte inc %o0 stb %o3,[%o1] ! store byte inc %o1 btst 3, %o0 bnz,pt %ncc, .bc_med_long0 dec %o2 .bc_med_long1: ! word aligned btst 7, %o0 ! check for long word bz,pt %ncc, .bc_med_long2 nop lduw [%o0], %o3 ! load word add %o0, 4, %o0 ! advance SRC by 4 stw %o3, [%o1] ! store word add %o1, 4, %o1 ! advance DST by 4 sub %o2, 4, %o2 ! reduce count by 4 ! ! Now long word aligned and have at least 32 bytes to move ! .bc_med_long2: sub %o2, 31, %o2 ! adjust count to allow cc zero test .bc_med_lmove: ldx [%o0], %o3 ! read long word stx %o3, [%o1] ! write long word subcc %o2, 32, %o2 ! reduce count by 32 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words add %o0, 32, %o0 ! advance SRC by 32 stx %o3, [%o1 + 8] ldx [%o0 - 16], %o3 add %o1, 32, %o1 ! advance DST by 32 stx %o3, [%o1 - 16] ldx [%o0 - 8], %o3 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left stx %o3, [%o1 - 8] addcc %o2, 24, %o2 ! 
restore count to long word offset ble,pt %ncc, .bc_med_lextra ! check for more long words to move nop .bc_med_lword: ldx [%o0], %o3 ! read long word subcc %o2, 8, %o2 ! reduce count by 8 stx %o3, [%o1] ! write long word add %o0, 8, %o0 ! advance SRC by 8 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left add %o1, 8, %o1 ! advance DST by 8 .bc_med_lextra: addcc %o2, 7, %o2 ! restore rest of count bz,pt %ncc, .bc_sm_exit ! if zero, then done deccc %o2 bz,pt %ncc, .bc_sm_byte nop ba,pt %ncc, .bc_sm_half nop .align 16 .bc_med_word: btst 3, %o0 ! check for bz,pt %ncc, .bc_med_word1 ! word alignment nop .bc_med_word0: ldub [%o0], %o3 ! load one byte inc %o0 stb %o3,[%o1] ! store byte inc %o1 btst 3, %o0 bnz,pt %ncc, .bc_med_word0 dec %o2 ! ! Now word aligned and have at least 36 bytes to move ! .bc_med_word1: sub %o2, 15, %o2 ! adjust count to allow cc zero test .bc_med_wmove: lduw [%o0], %o3 ! read word stw %o3, [%o1] ! write word subcc %o2, 16, %o2 ! reduce count by 16 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words add %o0, 16, %o0 ! advance SRC by 16 stw %o3, [%o1 + 4] lduw [%o0 - 8], %o3 add %o1, 16, %o1 ! advance DST by 16 stw %o3, [%o1 - 8] lduw [%o0 - 4], %o3 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left stw %o3, [%o1 - 4] addcc %o2, 12, %o2 ! restore count to word offset ble,pt %ncc, .bc_med_wextra ! check for more words to move nop .bc_med_word2: lduw [%o0], %o3 ! read word subcc %o2, 4, %o2 ! reduce count by 4 stw %o3, [%o1] ! write word add %o0, 4, %o0 ! advance SRC by 4 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left add %o1, 4, %o1 ! advance DST by 4 .bc_med_wextra: addcc %o2, 3, %o2 ! restore rest of count bz,pt %ncc, .bc_sm_exit ! if zero, then done deccc %o2 bz,pt %ncc, .bc_sm_byte nop ba,pt %ncc, .bc_sm_half nop .align 16 .bc_med_half: btst 1, %o0 ! check for bz,pt %ncc, .bc_med_half1 ! half word alignment nop ldub [%o0], %o3 ! load one byte inc %o0 stb %o3,[%o1] ! store byte inc %o1 dec %o2 ! ! Now half word aligned and have at least 38 bytes to move ! .bc_med_half1: sub %o2, 7, %o2 ! adjust count to allow cc zero test .bc_med_hmove: lduh [%o0], %o3 ! read half word sth %o3, [%o1] ! write half word subcc %o2, 8, %o2 ! reduce count by 8 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords add %o0, 8, %o0 ! advance SRC by 8 sth %o3, [%o1 + 2] lduh [%o0 - 4], %o3 add %o1, 8, %o1 ! advance DST by 8 sth %o3, [%o1 - 4] lduh [%o0 - 2], %o3 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left sth %o3, [%o1 - 2] addcc %o2, 7, %o2 ! restore count bz,pt %ncc, .bc_sm_exit deccc %o2 bz,pt %ncc, .bc_sm_byte nop ba,pt %ncc, .bc_sm_half nop SET_SIZE(bcopy) /* * The _more entry points are not intended to be used directly by * any caller from outside this file. They are provided to allow * profiling and dtrace of the portions of the copy code that uses * the floating point registers. * This entry is particularly important as DTRACE (at least as of * 4/2004) does not support leaf functions. */ ENTRY(bcopy_more) .bcopy_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault tst %l6 bz,pt %ncc, .do_copy nop sethi %hi(.copyerr), %o2 or %o2, %lo(.copyerr), %o2 membar #Sync ! sync error barrier stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector ! ! We've already captured whether t_lofault was zero on entry. ! We need to mark ourselves as being from bcopy since both ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set ! and the saved lofault was zero, we won't reset lofault on ! returning. 
! or %l6, TRAMP_FLAG, %l6 /* * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes * Also, use of FP registers has been tested to be enabled */ .do_copy: FP_NOMIGRATE(6, 7) rd %fprs, %o2 ! check for unused fp st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs btst FPRS_FEF, %o2 bz,a,pt %icc, .do_blockcopy wr %g0, FPRS_FEF, %fprs BST_FPQ1Q3_TOSTACK(%o2) .do_blockcopy: rd %gsr, %o2 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr or %l6, FPUSED_FLAG, %l6 #define REALSRC %i0 #define DST %i1 #define CNT %i2 #define SRC %i3 #define TMP %i5 andcc DST, VIS_BLOCKSIZE - 1, TMP bz,pt %ncc, 2f neg TMP add TMP, VIS_BLOCKSIZE, TMP ! TMP = bytes required to align DST on FP_BLOCK boundary ! Using SRC as a tmp here cmp TMP, 3 bleu,pt %ncc, 1f sub CNT,TMP,CNT ! adjust main count sub TMP, 3, TMP ! adjust for end of loop test .bc_blkalign: ldub [REALSRC], SRC ! move 4 bytes per loop iteration stb SRC, [DST] subcc TMP, 4, TMP ldub [REALSRC + 1], SRC add REALSRC, 4, REALSRC stb SRC, [DST + 1] ldub [REALSRC - 2], SRC add DST, 4, DST stb SRC, [DST - 2] ldub [REALSRC - 1], SRC bgu,pt %ncc, .bc_blkalign stb SRC, [DST - 1] addcc TMP, 3, TMP ! restore count adjustment bz,pt %ncc, 2f ! no bytes left? nop 1: ldub [REALSRC], SRC inc REALSRC inc DST deccc TMP bgu %ncc, 1b stb SRC, [DST - 1] 2: andn REALSRC, 0x7, SRC alignaddr REALSRC, %g0, %g0 ! SRC - 8-byte aligned ! DST - 64-byte aligned prefetch [SRC], #one_read prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read ldd [SRC], %f0 #if CHEETAH_PREFETCH > 4 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read #endif ldd [SRC + 0x08], %f2 #if CHEETAH_PREFETCH > 5 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read #endif ldd [SRC + 0x10], %f4 #if CHEETAH_PREFETCH > 6 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read #endif faligndata %f0, %f2, %f32 ldd [SRC + 0x18], %f6 #if CHEETAH_PREFETCH > 7 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read #endif faligndata %f2, %f4, %f34 ldd [SRC + 0x20], %f8 faligndata %f4, %f6, %f36 ldd [SRC + 0x28], %f10 faligndata %f6, %f8, %f38 ldd [SRC + 0x30], %f12 faligndata %f8, %f10, %f40 ldd [SRC + 0x38], %f14 faligndata %f10, %f12, %f42 ldd [SRC + VIS_BLOCKSIZE], %f0 sub CNT, VIS_BLOCKSIZE, CNT add SRC, VIS_BLOCKSIZE, SRC add REALSRC, VIS_BLOCKSIZE, REALSRC ba,a,pt %ncc, 1f nop .align 16 1: ldd [SRC + 0x08], %f2 faligndata %f12, %f14, %f44 ldd [SRC + 0x10], %f4 faligndata %f14, %f0, %f46 stda %f32, [DST]ASI_BLK_P ldd [SRC + 0x18], %f6 faligndata %f0, %f2, %f32 ldd [SRC + 0x20], %f8 faligndata %f2, %f4, %f34 ldd [SRC + 0x28], %f10 faligndata %f4, %f6, %f36 ldd [SRC + 0x30], %f12 faligndata %f6, %f8, %f38 ldd [SRC + 0x38], %f14 faligndata %f8, %f10, %f40 sub CNT, VIS_BLOCKSIZE, CNT ldd [SRC + VIS_BLOCKSIZE], %f0 faligndata %f10, %f12, %f42 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read add DST, VIS_BLOCKSIZE, DST prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read add REALSRC, VIS_BLOCKSIZE, REALSRC cmp CNT, VIS_BLOCKSIZE + 8 bgu,pt %ncc, 1b add SRC, VIS_BLOCKSIZE, SRC ! 
only if REALSRC & 0x7 is 0 cmp CNT, VIS_BLOCKSIZE bne %ncc, 3f andcc REALSRC, 0x7, %g0 bz,pt %ncc, 2f nop 3: faligndata %f12, %f14, %f44 faligndata %f14, %f0, %f46 stda %f32, [DST]ASI_BLK_P add DST, VIS_BLOCKSIZE, DST ba,pt %ncc, 3f nop 2: ldd [SRC + 0x08], %f2 fsrc1 %f12, %f44 ldd [SRC + 0x10], %f4 fsrc1 %f14, %f46 stda %f32, [DST]ASI_BLK_P ldd [SRC + 0x18], %f6 fsrc1 %f0, %f32 ldd [SRC + 0x20], %f8 fsrc1 %f2, %f34 ldd [SRC + 0x28], %f10 fsrc1 %f4, %f36 ldd [SRC + 0x30], %f12 fsrc1 %f6, %f38 ldd [SRC + 0x38], %f14 fsrc1 %f8, %f40 sub CNT, VIS_BLOCKSIZE, CNT add DST, VIS_BLOCKSIZE, DST add SRC, VIS_BLOCKSIZE, SRC add REALSRC, VIS_BLOCKSIZE, REALSRC fsrc1 %f10, %f42 fsrc1 %f12, %f44 fsrc1 %f14, %f46 stda %f32, [DST]ASI_BLK_P add DST, VIS_BLOCKSIZE, DST ba,a,pt %ncc, .bcb_exit nop 3: tst CNT bz,a,pt %ncc, .bcb_exit nop 5: ldub [REALSRC], TMP inc REALSRC inc DST deccc CNT bgu %ncc, 5b stb TMP, [DST - 1] .bcb_exit: membar #Sync FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8) FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9) FPRAS_CHECK(FPRAS_BCOPY, %l5, 9) ! outputs lost ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr wr %o2, 0, %gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz,pt %icc, 4f nop BLD_FPQ1Q3_FROMSTACK(%o2) ba,pt %ncc, 2f wr %o3, 0, %fprs ! restore fprs 4: FZEROQ1Q3 wr %o3, 0, %fprs ! restore fprs 2: membar #Sync ! sync error barrier andn %l6, MASK_FLAGS, %l6 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault FP_ALLOWMIGRATE(5, 6) ret restore %g0, 0, %o0 SET_SIZE(bcopy_more) /* * Block copy with possibly overlapped operands. */ ENTRY(ovbcopy) tst %o2 ! check count bgu,a %ncc, 1f ! nothing to do or bad arguments subcc %o0, %o1, %o3 ! difference of from and to address retl ! return nop 1: bneg,a %ncc, 2f neg %o3 ! if < 0, make it positive 2: cmp %o2, %o3 ! cmp size and abs(from - to) bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, .empty ! no overlap cmp %o0, %o1 ! compare from and to addresses blu %ncc, .ov_bkwd ! if from < to, copy backwards nop ! ! Copy forwards. ! .ov_fwd: ldub [%o0], %o3 ! read from address inc %o0 ! inc from address stb %o3, [%o1] ! write to address deccc %o2 ! dec count bgu %ncc, .ov_fwd ! loop till done inc %o1 ! inc to address retl ! return nop ! ! Copy backwards. ! .ov_bkwd: deccc %o2 ! dec count ldub [%o0 + %o2], %o3 ! get byte at end of src bgu %ncc, .ov_bkwd ! loop till done stb %o3, [%o1 + %o2] ! delay slot, store at end of dst retl ! return nop SET_SIZE(ovbcopy) /* * hwblkpagecopy() * * Copies exactly one page. This routine assumes the caller (ppcopy) * has already disabled kernel preemption and has checked * use_hw_bcopy. Preventing preemption also prevents cpu migration. */ ENTRY(hwblkpagecopy) ! get another window w/space for three aligned blocks of saved fpregs save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp ! %i0 - source address (arg) ! %i1 - destination address (arg) ! %i2 - length of region (not arg) ! %l0 - saved fprs ! %l1 - pointer to saved fpregs rd %fprs, %l0 ! 
check for unused fp btst FPRS_FEF, %l0 bz,a,pt %icc, 1f wr %g0, FPRS_FEF, %fprs BST_FPQ1Q3_TOSTACK(%l1) 1: set PAGESIZE, CNT mov REALSRC, SRC prefetch [SRC], #one_read prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read ldd [SRC], %f0 #if CHEETAH_PREFETCH > 4 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read #endif ldd [SRC + 0x08], %f2 #if CHEETAH_PREFETCH > 5 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read #endif ldd [SRC + 0x10], %f4 #if CHEETAH_PREFETCH > 6 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read #endif fsrc1 %f0, %f32 ldd [SRC + 0x18], %f6 #if CHEETAH_PREFETCH > 7 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read #endif fsrc1 %f2, %f34 ldd [SRC + 0x20], %f8 fsrc1 %f4, %f36 ldd [SRC + 0x28], %f10 fsrc1 %f6, %f38 ldd [SRC + 0x30], %f12 fsrc1 %f8, %f40 ldd [SRC + 0x38], %f14 fsrc1 %f10, %f42 ldd [SRC + VIS_BLOCKSIZE], %f0 sub CNT, VIS_BLOCKSIZE, CNT add SRC, VIS_BLOCKSIZE, SRC ba,a,pt %ncc, 2f nop .align 16 2: ldd [SRC + 0x08], %f2 fsrc1 %f12, %f44 ldd [SRC + 0x10], %f4 fsrc1 %f14, %f46 stda %f32, [DST]ASI_BLK_P ldd [SRC + 0x18], %f6 fsrc1 %f0, %f32 ldd [SRC + 0x20], %f8 fsrc1 %f2, %f34 ldd [SRC + 0x28], %f10 fsrc1 %f4, %f36 ldd [SRC + 0x30], %f12 fsrc1 %f6, %f38 ldd [SRC + 0x38], %f14 fsrc1 %f8, %f40 ldd [SRC + VIS_BLOCKSIZE], %f0 fsrc1 %f10, %f42 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read sub CNT, VIS_BLOCKSIZE, CNT add DST, VIS_BLOCKSIZE, DST cmp CNT, VIS_BLOCKSIZE + 8 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read bgu,pt %ncc, 2b add SRC, VIS_BLOCKSIZE, SRC ! trailing block ldd [SRC + 0x08], %f2 fsrc1 %f12, %f44 ldd [SRC + 0x10], %f4 fsrc1 %f14, %f46 stda %f32, [DST]ASI_BLK_P ldd [SRC + 0x18], %f6 fsrc1 %f0, %f32 ldd [SRC + 0x20], %f8 fsrc1 %f2, %f34 ldd [SRC + 0x28], %f10 fsrc1 %f4, %f36 ldd [SRC + 0x30], %f12 fsrc1 %f6, %f38 ldd [SRC + 0x38], %f14 fsrc1 %f8, %f40 sub CNT, VIS_BLOCKSIZE, CNT add DST, VIS_BLOCKSIZE, DST add SRC, VIS_BLOCKSIZE, SRC fsrc1 %f10, %f42 fsrc1 %f12, %f44 fsrc1 %f14, %f46 stda %f32, [DST]ASI_BLK_P membar #Sync FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8) FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9) FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9) ! lose outputs btst FPRS_FEF, %l0 bz,pt %icc, 2f nop BLD_FPQ1Q3_FROMSTACK(%l3) ba 3f nop 2: FZEROQ1Q3 3: wr %l0, 0, %fprs ! restore fprs ret restore %g0, 0, %o0 SET_SIZE(hwblkpagecopy) /* * Transfer data to and from user space - * Note that these routines can cause faults * It is assumed that the kernel has nothing at * less than KERNELBASE in the virtual address space. * * Note that copyin(9F) and copyout(9F) are part of the * DDI/DKI which specifies that they return '-1' on "errors." * * Sigh. * * So there's two extremely similar routines - xcopyin() and xcopyout() * which return the errno that we've faithfully computed. This * allows other callers (e.g. uiomove(9F)) to work correctly. * Given that these are used pretty heavily, we expand the calling * sequences inline for all flavours (rather than making wrappers). * * There are also stub routines for xcopyout_little and xcopyin_little, * which currently are intended to handle requests of <= 16 bytes from * do_unaligned. Future enhancement to make them handle 8k pages efficiently * is left as an exercise... 
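 *
 * As a hedged illustration of the conventions above (kbuf, uaddr, len
 * and the surrounding driver code are made-up names, not part of this
 * file), a typical in-kernel caller sees the difference like this:
 *
 *	// copyout(9F) collapses any fault to -1, so a driver usually
 *	// substitutes its own errno:
 *	if (copyout(kbuf, uaddr, len) != 0)
 *		return (EFAULT);
 *
 *	// xcopyout() instead returns the errno computed by the fault
 *	// handler, which callers such as uiomove() can pass through:
 *	error = xcopyout(kbuf, uaddr, len);
 *	if (error != 0)
 *		return (error);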
*/ /* * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) * * General theory of operation: * * The only difference between copy{in,out} and * xcopy{in,out} is in the error handling routine they invoke * when a memory access error occurs. xcopyOP returns the errno * while copyOP returns -1 (see above). copy{in,out}_noerr set * a special flag (by oring the TRAMP_FLAG into the fault handler address) * if they are called with a fault handler already in place. That flag * causes the default handlers to trampoline to the previous handler * upon an error. * * None of the copyops routines grab a window until it's decided that * we need to do a HW block copy operation. This saves a window * spill/fill when we're called during socket ops. The typical IO * path won't cause spill/fill traps. * * This code uses a set of 4 limits for the maximum size that will * be copied given a particular input/output address alignment. * If the value for a particular limit is zero, the copy will be performed * by the plain copy loops rather than FPBLK. * * See the description of bcopy above for more details of the * data copying algorithm and the default limits. * */ /* * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). */ /* * We save the arguments in the following registers in case of a fault: * kaddr - %l1 * uaddr - %l2 * count - %l3 */ #define SAVE_SRC %l1 #define SAVE_DST %l2 #define SAVE_COUNT %l3 #define SM_SAVE_SRC %g4 #define SM_SAVE_DST %g5 #define SM_SAVE_COUNT %o5 #define ERRNO %l5 #define REAL_LOFAULT %l4 /* * Generic copyio fault handler. This is the first line of defense when a * fault occurs in (x)copyin/(x)copyout. In order for this to function * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. * This allows us to share common code for all the flavors of the copy * operations, including the _noerr versions. * * Note that this function will restore the original input parameters before * calling REAL_LOFAULT. So the real handler can vector to the appropriate * member of the t_copyop structure, if needed. */ ENTRY(copyio_fault) membar #Sync mov %g1,ERRNO ! save errno in ERRNO btst FPUSED_FLAG, %l6 bz %ncc, 1f nop ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 wr %o2, 0, %gsr ! restore gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz,pt %icc, 4f nop BLD_FPQ2Q4_FROMSTACK(%o2) ba,pt %ncc, 1f wr %o3, 0, %fprs ! restore fprs 4: FZEROQ2Q4 wr %o3, 0, %fprs ! restore fprs 1: andn %l6, FPUSED_FLAG, %l6 membar #Sync stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault FP_ALLOWMIGRATE(5, 6) mov SAVE_SRC, %i0 mov SAVE_DST, %i1 jmp REAL_LOFAULT mov SAVE_COUNT, %i2 SET_SIZE(copyio_fault) ENTRY(copyout) cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case bleu,pt %ncc, .copyout_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .copyout_8 ! check for longword alignment nop btst 1, %o3 ! bz,pt %ncc, .copyout_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .copyout_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyout_small ! go to small copy nop ba,pt %ncc, .copyout_more ! otherwise go to large copy nop .copyout_2: btst 3, %o3 ! bz,pt %ncc, .copyout_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .copyout_small ! if zero, disable HW copy cmp %o2, %o3 ! 
if length <= limit bleu,pt %ncc, .copyout_small ! go to small copy nop ba,pt %ncc, .copyout_more ! otherwise go to large copy nop .copyout_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .copyout_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyout_small ! go to small copy nop ba,pt %ncc, .copyout_more ! otherwise go to large copy nop .copyout_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .copyout_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyout_small ! go to small copy nop ba,pt %ncc, .copyout_more ! otherwise go to large copy nop .align 16 nop ! instruction alignment ! see discussion at start of file .copyout_small: sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault or %o5, %lo(.sm_copyout_err), %o5 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler membar #Sync ! sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault .sm_do_copyout: mov %o0, SM_SAVE_SRC mov %o1, SM_SAVE_DST cmp %o2, SHORTCOPY ! check for really short case bleu,pt %ncc, .co_sm_left ! mov %o2, SM_SAVE_COUNT cmp %o2, CHKSIZE ! check for medium length cases bgu,pn %ncc, .co_med ! or %o0, %o1, %o3 ! prepare alignment check andcc %o3, 0x3, %g0 ! test for alignment bz,pt %ncc, .co_sm_word ! branch to word aligned case .co_sm_movebytes: sub %o2, 3, %o2 ! adjust count to allow cc zero test .co_sm_notalign4: ldub [%o0], %o3 ! read byte subcc %o2, 4, %o2 ! reduce count by 4 stba %o3, [%o1]ASI_USER ! write byte inc %o1 ! advance DST by 1 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes add %o0, 4, %o0 ! advance SRC by 4 stba %o3, [%o1]ASI_USER inc %o1 ! advance DST by 1 ldub [%o0 - 2], %o3 stba %o3, [%o1]ASI_USER inc %o1 ! advance DST by 1 ldub [%o0 - 1], %o3 stba %o3, [%o1]ASI_USER bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain inc %o1 ! advance DST by 1 add %o2, 3, %o2 ! restore count .co_sm_left: tst %o2 bz,pt %ncc, .co_sm_exit ! check for zero length nop ldub [%o0], %o3 ! load one byte deccc %o2 ! reduce count for cc test bz,pt %ncc, .co_sm_exit stba %o3,[%o1]ASI_USER ! store one byte ldub [%o0 + 1], %o3 ! load second byte deccc %o2 inc %o1 bz,pt %ncc, .co_sm_exit stba %o3,[%o1]ASI_USER ! store second byte ldub [%o0 + 2], %o3 ! load third byte inc %o1 stba %o3,[%o1]ASI_USER ! store third byte membar #Sync ! sync error barrier stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return 0 .align 16 .co_sm_words: lduw [%o0], %o3 ! read word .co_sm_wordx: subcc %o2, 8, %o2 ! update count stwa %o3, [%o1]ASI_USER ! write word add %o0, 8, %o0 ! update SRC lduw [%o0 - 4], %o3 ! read word add %o1, 4, %o1 ! update DST stwa %o3, [%o1]ASI_USER ! write word bgt,pt %ncc, .co_sm_words ! loop til done add %o1, 4, %o1 ! update DST addcc %o2, 7, %o2 ! restore count bz,pt %ncc, .co_sm_exit nop deccc %o2 bz,pt %ncc, .co_sm_byte .co_sm_half: subcc %o2, 2, %o2 ! reduce count by 2 lduh [%o0], %o3 ! read half word add %o0, 2, %o0 ! advance SRC by 2 stha %o3, [%o1]ASI_USER ! write half word bgt,pt %ncc, .co_sm_half ! loop til done add %o1, 2, %o1 ! advance DST by 2 addcc %o2, 1, %o2 ! restore count bz,pt %ncc, .co_sm_exit nop .co_sm_byte: ldub [%o0], %o3 stba %o3, [%o1]ASI_USER membar #Sync ! sync error barrier stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! 
return 0 .align 16 .co_sm_word: subcc %o2, 4, %o2 ! update count bgt,pt %ncc, .co_sm_wordx lduw [%o0], %o3 ! read word addcc %o2, 3, %o2 ! restore count bz,pt %ncc, .co_sm_exit stwa %o3, [%o1]ASI_USER ! write word deccc %o2 ! reduce count for cc test ldub [%o0 + 4], %o3 ! load one byte add %o1, 4, %o1 bz,pt %ncc, .co_sm_exit stba %o3, [%o1]ASI_USER ! store one byte ldub [%o0 + 5], %o3 ! load second byte deccc %o2 inc %o1 bz,pt %ncc, .co_sm_exit stba %o3, [%o1]ASI_USER ! store second byte ldub [%o0 + 6], %o3 ! load third byte inc %o1 stba %o3, [%o1]ASI_USER ! store third byte .co_sm_exit: membar #Sync ! sync error barrier stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return 0 .align 16 .co_med: xor %o0, %o1, %o3 ! setup alignment check btst 1, %o3 bnz,pt %ncc, .co_sm_movebytes ! unaligned nop btst 3, %o3 bnz,pt %ncc, .co_med_half ! halfword aligned nop btst 7, %o3 bnz,pt %ncc, .co_med_word ! word aligned nop .co_med_long: btst 3, %o0 ! check for bz,pt %ncc, .co_med_long1 ! word alignment nop .co_med_long0: ldub [%o0], %o3 ! load one byte inc %o0 stba %o3,[%o1]ASI_USER ! store byte inc %o1 btst 3, %o0 bnz,pt %ncc, .co_med_long0 dec %o2 .co_med_long1: ! word aligned btst 7, %o0 ! check for long word bz,pt %ncc, .co_med_long2 nop lduw [%o0], %o3 ! load word add %o0, 4, %o0 ! advance SRC by 4 stwa %o3, [%o1]ASI_USER ! store word add %o1, 4, %o1 ! advance DST by 4 sub %o2, 4, %o2 ! reduce count by 4 ! ! Now long word aligned and have at least 32 bytes to move ! .co_med_long2: sub %o2, 31, %o2 ! adjust count to allow cc zero test sub %o1, 8, %o1 ! adjust pointer to allow store in ! branch delay slot instead of add .co_med_lmove: add %o1, 8, %o1 ! advance DST by 8 ldx [%o0], %o3 ! read long word subcc %o2, 32, %o2 ! reduce count by 32 stxa %o3, [%o1]ASI_USER ! write long word add %o1, 8, %o1 ! advance DST by 8 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words add %o0, 32, %o0 ! advance SRC by 32 stxa %o3, [%o1]ASI_USER ldx [%o0 - 16], %o3 add %o1, 8, %o1 ! advance DST by 8 stxa %o3, [%o1]ASI_USER ldx [%o0 - 8], %o3 add %o1, 8, %o1 ! advance DST by 8 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left stxa %o3, [%o1]ASI_USER add %o1, 8, %o1 ! advance DST by 8 addcc %o2, 24, %o2 ! restore count to long word offset ble,pt %ncc, .co_med_lextra ! check for more long words to move nop .co_med_lword: ldx [%o0], %o3 ! read long word subcc %o2, 8, %o2 ! reduce count by 8 stxa %o3, [%o1]ASI_USER ! write long word add %o0, 8, %o0 ! advance SRC by 8 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left add %o1, 8, %o1 ! advance DST by 8 .co_med_lextra: addcc %o2, 7, %o2 ! restore rest of count bz,pt %ncc, .co_sm_exit ! if zero, then done deccc %o2 bz,pt %ncc, .co_sm_byte nop ba,pt %ncc, .co_sm_half nop .align 16 nop ! instruction alignment ! see discussion at start of file .co_med_word: btst 3, %o0 ! check for bz,pt %ncc, .co_med_word1 ! word alignment nop .co_med_word0: ldub [%o0], %o3 ! load one byte inc %o0 stba %o3,[%o1]ASI_USER ! store byte inc %o1 btst 3, %o0 bnz,pt %ncc, .co_med_word0 dec %o2 ! ! Now word aligned and have at least 36 bytes to move ! .co_med_word1: sub %o2, 15, %o2 ! adjust count to allow cc zero test .co_med_wmove: lduw [%o0], %o3 ! read word subcc %o2, 16, %o2 ! reduce count by 16 stwa %o3, [%o1]ASI_USER ! write word add %o1, 4, %o1 ! advance DST by 4 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words add %o0, 16, %o0 ! advance SRC by 16 stwa %o3, [%o1]ASI_USER add %o1, 4, %o1 ! 
advance DST by 4 lduw [%o0 - 8], %o3 stwa %o3, [%o1]ASI_USER add %o1, 4, %o1 ! advance DST by 4 lduw [%o0 - 4], %o3 stwa %o3, [%o1]ASI_USER bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left add %o1, 4, %o1 ! advance DST by 4 addcc %o2, 12, %o2 ! restore count to word offset ble,pt %ncc, .co_med_wextra ! check for more words to move nop .co_med_word2: lduw [%o0], %o3 ! read word subcc %o2, 4, %o2 ! reduce count by 4 stwa %o3, [%o1]ASI_USER ! write word add %o0, 4, %o0 ! advance SRC by 4 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left add %o1, 4, %o1 ! advance DST by 4 .co_med_wextra: addcc %o2, 3, %o2 ! restore rest of count bz,pt %ncc, .co_sm_exit ! if zero, then done deccc %o2 bz,pt %ncc, .co_sm_byte nop ba,pt %ncc, .co_sm_half nop .align 16 nop ! instruction alignment nop ! see discussion at start of file nop .co_med_half: btst 1, %o0 ! check for bz,pt %ncc, .co_med_half1 ! half word alignment nop ldub [%o0], %o3 ! load one byte inc %o0 stba %o3,[%o1]ASI_USER ! store byte inc %o1 dec %o2 ! ! Now half word aligned and have at least 38 bytes to move ! .co_med_half1: sub %o2, 7, %o2 ! adjust count to allow cc zero test .co_med_hmove: lduh [%o0], %o3 ! read half word subcc %o2, 8, %o2 ! reduce count by 8 stha %o3, [%o1]ASI_USER ! write half word add %o1, 2, %o1 ! advance DST by 2 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords add %o0, 8, %o0 ! advance SRC by 8 stha %o3, [%o1]ASI_USER add %o1, 2, %o1 ! advance DST by 2 lduh [%o0 - 4], %o3 stha %o3, [%o1]ASI_USER add %o1, 2, %o1 ! advance DST by 2 lduh [%o0 - 2], %o3 stha %o3, [%o1]ASI_USER bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left add %o1, 2, %o1 ! advance DST by 2 addcc %o2, 7, %o2 ! restore count bz,pt %ncc, .co_sm_exit deccc %o2 bz,pt %ncc, .co_sm_byte nop ba,pt %ncc, .co_sm_half nop /* * We got here because of a fault during short copyout. * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). */ .sm_copyout_err: membar #Sync stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault mov SM_SAVE_SRC, %o0 mov SM_SAVE_DST, %o1 mov SM_SAVE_COUNT, %o2 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler tst %o3 bz,pt %ncc, 3f ! if not, return error nop ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with jmp %o5 ! original arguments nop 3: retl or %g0, -1, %o0 ! return error value SET_SIZE(copyout) /* * The _more entry points are not intended to be used directly by * any caller from outside this file. They are provided to allow * profiling and dtrace of the portions of the copy code that uses * the floating point registers. * This entry is particularly important as DTRACE (at least as of * 4/2004) does not support leaf functions. */ ENTRY(copyout_more) .copyout_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp set .copyout_err, REAL_LOFAULT /* * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes */ .do_copyout: set copyio_fault, %l7 ! .copyio_fault is lofault val ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler membar #Sync ! sync error barrier stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault mov %i0, SAVE_SRC mov %i1, SAVE_DST mov %i2, SAVE_COUNT FP_NOMIGRATE(6, 7) rd %fprs, %o2 ! check for unused fp st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs btst FPRS_FEF, %o2 bz,a,pt %icc, .do_blockcopyout wr %g0, FPRS_FEF, %fprs BST_FPQ2Q4_TOSTACK(%o2) .do_blockcopyout: rd %gsr, %o2 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! 
save gsr or %l6, FPUSED_FLAG, %l6 andcc DST, VIS_BLOCKSIZE - 1, TMP mov ASI_USER, %asi bz,pt %ncc, 2f neg TMP add TMP, VIS_BLOCKSIZE, TMP ! TMP = bytes required to align DST on FP_BLOCK boundary ! Using SRC as a tmp here cmp TMP, 3 bleu,pt %ncc, 1f sub CNT,TMP,CNT ! adjust main count sub TMP, 3, TMP ! adjust for end of loop test .co_blkalign: ldub [REALSRC], SRC ! move 4 bytes per loop iteration stba SRC, [DST]%asi subcc TMP, 4, TMP ldub [REALSRC + 1], SRC add REALSRC, 4, REALSRC stba SRC, [DST + 1]%asi ldub [REALSRC - 2], SRC add DST, 4, DST stba SRC, [DST - 2]%asi ldub [REALSRC - 1], SRC bgu,pt %ncc, .co_blkalign stba SRC, [DST - 1]%asi addcc TMP, 3, TMP ! restore count adjustment bz,pt %ncc, 2f ! no bytes left? nop 1: ldub [REALSRC], SRC inc REALSRC inc DST deccc TMP bgu %ncc, 1b stba SRC, [DST - 1]%asi 2: andn REALSRC, 0x7, SRC alignaddr REALSRC, %g0, %g0 ! SRC - 8-byte aligned ! DST - 64-byte aligned prefetch [SRC], #one_read prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read ldd [SRC], %f16 #if CHEETAH_PREFETCH > 4 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read #endif ldd [SRC + 0x08], %f18 #if CHEETAH_PREFETCH > 5 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read #endif ldd [SRC + 0x10], %f20 #if CHEETAH_PREFETCH > 6 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read #endif faligndata %f16, %f18, %f48 ldd [SRC + 0x18], %f22 #if CHEETAH_PREFETCH > 7 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read #endif faligndata %f18, %f20, %f50 ldd [SRC + 0x20], %f24 faligndata %f20, %f22, %f52 ldd [SRC + 0x28], %f26 faligndata %f22, %f24, %f54 ldd [SRC + 0x30], %f28 faligndata %f24, %f26, %f56 ldd [SRC + 0x38], %f30 faligndata %f26, %f28, %f58 ldd [SRC + VIS_BLOCKSIZE], %f16 sub CNT, VIS_BLOCKSIZE, CNT add SRC, VIS_BLOCKSIZE, SRC add REALSRC, VIS_BLOCKSIZE, REALSRC ba,a,pt %ncc, 1f nop .align 16 1: ldd [SRC + 0x08], %f18 faligndata %f28, %f30, %f60 ldd [SRC + 0x10], %f20 faligndata %f30, %f16, %f62 stda %f48, [DST]ASI_BLK_AIUS ldd [SRC + 0x18], %f22 faligndata %f16, %f18, %f48 ldd [SRC + 0x20], %f24 faligndata %f18, %f20, %f50 ldd [SRC + 0x28], %f26 faligndata %f20, %f22, %f52 ldd [SRC + 0x30], %f28 faligndata %f22, %f24, %f54 ldd [SRC + 0x38], %f30 faligndata %f24, %f26, %f56 sub CNT, VIS_BLOCKSIZE, CNT ldd [SRC + VIS_BLOCKSIZE], %f16 faligndata %f26, %f28, %f58 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read add DST, VIS_BLOCKSIZE, DST prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read add REALSRC, VIS_BLOCKSIZE, REALSRC cmp CNT, VIS_BLOCKSIZE + 8 bgu,pt %ncc, 1b add SRC, VIS_BLOCKSIZE, SRC ! 
only if REALSRC & 0x7 is 0 cmp CNT, VIS_BLOCKSIZE bne %ncc, 3f andcc REALSRC, 0x7, %g0 bz,pt %ncc, 2f nop 3: faligndata %f28, %f30, %f60 faligndata %f30, %f16, %f62 stda %f48, [DST]ASI_BLK_AIUS add DST, VIS_BLOCKSIZE, DST ba,pt %ncc, 3f nop 2: ldd [SRC + 0x08], %f18 fsrc1 %f28, %f60 ldd [SRC + 0x10], %f20 fsrc1 %f30, %f62 stda %f48, [DST]ASI_BLK_AIUS ldd [SRC + 0x18], %f22 fsrc1 %f16, %f48 ldd [SRC + 0x20], %f24 fsrc1 %f18, %f50 ldd [SRC + 0x28], %f26 fsrc1 %f20, %f52 ldd [SRC + 0x30], %f28 fsrc1 %f22, %f54 ldd [SRC + 0x38], %f30 fsrc1 %f24, %f56 sub CNT, VIS_BLOCKSIZE, CNT add DST, VIS_BLOCKSIZE, DST add SRC, VIS_BLOCKSIZE, SRC add REALSRC, VIS_BLOCKSIZE, REALSRC fsrc1 %f26, %f58 fsrc1 %f28, %f60 fsrc1 %f30, %f62 stda %f48, [DST]ASI_BLK_AIUS add DST, VIS_BLOCKSIZE, DST ba,a,pt %ncc, 4f nop 3: tst CNT bz,a %ncc, 4f nop 5: ldub [REALSRC], TMP inc REALSRC inc DST deccc CNT bgu %ncc, 5b stba TMP, [DST - 1]%asi 4: .copyout_exit: membar #Sync FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8) FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9) FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9) ! lose outputs ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 wr %o2, 0, %gsr ! restore gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz,pt %icc, 4f nop BLD_FPQ2Q4_FROMSTACK(%o2) ba,pt %ncc, 1f wr %o3, 0, %fprs ! restore fprs 4: FZEROQ2Q4 wr %o3, 0, %fprs ! restore fprs 1: membar #Sync andn %l6, FPUSED_FLAG, %l6 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault FP_ALLOWMIGRATE(5, 6) ret restore %g0, 0, %o0 /* * We got here because of a fault during copyout. * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). */ .copyout_err: ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler tst %o4 bz,pt %ncc, 2f ! if not, return error nop ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with jmp %g2 ! original arguments restore %g0, 0, %g0 ! dispose of copy window 2: ret restore %g0, -1, %o0 ! return error value SET_SIZE(copyout_more) ENTRY(xcopyout) cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case bleu,pt %ncc, .xcopyout_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .xcopyout_8 ! nop btst 1, %o3 ! bz,pt %ncc, .xcopyout_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyout_small ! go to small copy nop ba,pt %ncc, .xcopyout_more ! otherwise go to large copy nop .xcopyout_2: btst 3, %o3 ! bz,pt %ncc, .xcopyout_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyout_small ! go to small copy nop ba,pt %ncc, .xcopyout_more ! otherwise go to large copy nop .xcopyout_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyout_small ! go to small copy nop ba,pt %ncc, .xcopyout_more ! otherwise go to large copy nop .xcopyout_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyout_small ! 
go to small copy nop ba,pt %ncc, .xcopyout_more ! otherwise go to large copy nop .xcopyout_small: sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault or %o5, %lo(.sm_xcopyout_err), %o5 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler membar #Sync ! sync error barrier ba,pt %ncc, .sm_do_copyout ! common code stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault .xcopyout_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp sethi %hi(.xcopyout_err), REAL_LOFAULT ba,pt %ncc, .do_copyout ! common code or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT /* * We got here because of fault during xcopyout * Errno value is in ERRNO */ .xcopyout_err: ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler tst %o4 bz,pt %ncc, 2f ! if not, return error nop ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with jmp %g2 ! original arguments restore %g0, 0, %g0 ! dispose of copy window 2: ret restore ERRNO, 0, %o0 ! return errno value .sm_xcopyout_err: membar #Sync stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault mov SM_SAVE_SRC, %o0 mov SM_SAVE_DST, %o1 mov SM_SAVE_COUNT, %o2 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler tst %o3 bz,pt %ncc, 3f ! if not, return error nop ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with jmp %o5 ! original arguments nop 3: retl or %g1, 0, %o0 ! return errno value SET_SIZE(xcopyout) ENTRY(xcopyout_little) sethi %hi(.xcopyio_err), %o5 or %o5, %lo(.xcopyio_err), %o5 ldn [THREAD_REG + T_LOFAULT], %o4 membar #Sync ! sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] mov %o4, %o5 subcc %g0, %o2, %o3 add %o0, %o2, %o0 bz,pn %ncc, 2f ! check for zero bytes sub %o2, 1, %o4 add %o0, %o4, %o0 ! start w/last byte add %o1, %o2, %o1 ldub [%o0 + %o3], %o4 1: stba %o4, [%o1 + %o3]ASI_AIUSL inccc %o3 sub %o0, 2, %o0 ! get next byte bcc,a,pt %ncc, 1b ldub [%o0 + %o3], %o4 2: membar #Sync ! sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return (0) SET_SIZE(xcopyout_little) /* * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) */ ENTRY(copyin) cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case bleu,pt %ncc, .copyin_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .copyin_8 ! check for longword alignment nop btst 1, %o3 ! bz,pt %ncc, .copyin_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .copyin_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_small ! go to small copy nop ba,pt %ncc, .copyin_more ! otherwise go to large copy nop .copyin_2: btst 3, %o3 ! bz,pt %ncc, .copyin_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .copyin_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_small ! go to small copy nop ba,pt %ncc, .copyin_more ! otherwise go to large copy nop .copyin_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .copyin_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_small ! go to small copy nop ba,pt %ncc, .copyin_more ! otherwise go to large copy nop .copyin_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .copyin_small ! 
if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_small ! go to small copy nop ba,pt %ncc, .copyin_more ! otherwise go to large copy nop .align 16 nop ! instruction alignment ! see discussion at start of file .copyin_small: sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault or %o5, %lo(.sm_copyin_err), %o5 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp membar #Sync ! sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] .sm_do_copyin: mov %o0, SM_SAVE_SRC mov %o1, SM_SAVE_DST cmp %o2, SHORTCOPY ! check for really short case bleu,pt %ncc, .ci_sm_left ! mov %o2, SM_SAVE_COUNT cmp %o2, CHKSIZE ! check for medium length cases bgu,pn %ncc, .ci_med ! or %o0, %o1, %o3 ! prepare alignment check andcc %o3, 0x3, %g0 ! test for alignment bz,pt %ncc, .ci_sm_word ! branch to word aligned case .ci_sm_movebytes: sub %o2, 3, %o2 ! adjust count to allow cc zero test .ci_sm_notalign4: lduba [%o0]ASI_USER, %o3 ! read byte subcc %o2, 4, %o2 ! reduce count by 4 stb %o3, [%o1] ! write byte add %o0, 1, %o0 ! advance SRC by 1 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes add %o0, 1, %o0 ! advance SRC by 1 stb %o3, [%o1 + 1] add %o1, 4, %o1 ! advance DST by 4 lduba [%o0]ASI_USER, %o3 add %o0, 1, %o0 ! advance SRC by 1 stb %o3, [%o1 - 2] lduba [%o0]ASI_USER, %o3 add %o0, 1, %o0 ! advance SRC by 1 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain stb %o3, [%o1 - 1] add %o2, 3, %o2 ! restore count .ci_sm_left: tst %o2 bz,pt %ncc, .ci_sm_exit nop lduba [%o0]ASI_USER, %o3 ! load one byte deccc %o2 ! reduce count for cc test bz,pt %ncc, .ci_sm_exit stb %o3,[%o1] ! store one byte inc %o0 lduba [%o0]ASI_USER, %o3 ! load second byte deccc %o2 bz,pt %ncc, .ci_sm_exit stb %o3,[%o1 + 1] ! store second byte inc %o0 lduba [%o0]ASI_USER, %o3 ! load third byte stb %o3,[%o1 + 2] ! store third byte membar #Sync ! sync error barrier stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return 0 .align 16 .ci_sm_words: lduwa [%o0]ASI_USER, %o3 ! read word .ci_sm_wordx: subcc %o2, 8, %o2 ! update count stw %o3, [%o1] ! write word add %o0, 4, %o0 ! update SRC add %o1, 8, %o1 ! update DST lduwa [%o0]ASI_USER, %o3 ! read word add %o0, 4, %o0 ! update SRC bgt,pt %ncc, .ci_sm_words ! loop til done stw %o3, [%o1 - 4] ! write word addcc %o2, 7, %o2 ! restore count bz,pt %ncc, .ci_sm_exit nop deccc %o2 bz,pt %ncc, .ci_sm_byte .ci_sm_half: subcc %o2, 2, %o2 ! reduce count by 2 lduha [%o0]ASI_USER, %o3 ! read half word add %o0, 2, %o0 ! advance SRC by 2 add %o1, 2, %o1 ! advance DST by 2 bgt,pt %ncc, .ci_sm_half ! loop til done sth %o3, [%o1 - 2] ! write half word addcc %o2, 1, %o2 ! restore count bz,pt %ncc, .ci_sm_exit nop .ci_sm_byte: lduba [%o0]ASI_USER, %o3 stb %o3, [%o1] membar #Sync ! sync error barrier stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return 0 .align 16 .ci_sm_word: subcc %o2, 4, %o2 ! update count bgt,pt %ncc, .ci_sm_wordx lduwa [%o0]ASI_USER, %o3 ! read word addcc %o2, 3, %o2 ! restore count bz,pt %ncc, .ci_sm_exit stw %o3, [%o1] ! write word deccc %o2 ! reduce count for cc test add %o0, 4, %o0 lduba [%o0]ASI_USER, %o3 ! load one byte bz,pt %ncc, .ci_sm_exit stb %o3, [%o1 + 4] ! store one byte inc %o0 lduba [%o0]ASI_USER, %o3 ! load second byte deccc %o2 bz,pt %ncc, .ci_sm_exit stb %o3, [%o1 + 5] ! store second byte inc %o0 lduba [%o0]ASI_USER, %o3 ! load third byte stb %o3, [%o1 + 6] ! store third byte .ci_sm_exit: membar #Sync ! sync error barrier stn %o4, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault retl mov %g0, %o0 ! return 0 .align 16 .ci_med: xor %o0, %o1, %o3 ! setup alignment check btst 1, %o3 bnz,pt %ncc, .ci_sm_movebytes ! unaligned nop btst 3, %o3 bnz,pt %ncc, .ci_med_half ! halfword aligned nop btst 7, %o3 bnz,pt %ncc, .ci_med_word ! word aligned nop .ci_med_long: btst 3, %o0 ! check for bz,pt %ncc, .ci_med_long1 ! word alignment nop .ci_med_long0: lduba [%o0]ASI_USER, %o3 ! load one byte inc %o0 stb %o3,[%o1] ! store byte inc %o1 btst 3, %o0 bnz,pt %ncc, .ci_med_long0 dec %o2 .ci_med_long1: ! word aligned btst 7, %o0 ! check for long word bz,pt %ncc, .ci_med_long2 nop lduwa [%o0]ASI_USER, %o3 ! load word add %o0, 4, %o0 ! advance SRC by 4 stw %o3, [%o1] ! store word add %o1, 4, %o1 ! advance DST by 4 sub %o2, 4, %o2 ! reduce count by 4 ! ! Now long word aligned and have at least 32 bytes to move ! .ci_med_long2: sub %o2, 31, %o2 ! adjust count to allow cc zero test .ci_med_lmove: ldxa [%o0]ASI_USER, %o3 ! read long word subcc %o2, 32, %o2 ! reduce count by 32 stx %o3, [%o1] ! write long word add %o0, 8, %o0 ! advance SRC by 8 ldxa [%o0]ASI_USER, %o3 ! repeat for a total for 4 long words add %o0, 8, %o0 ! advance SRC by 8 stx %o3, [%o1 + 8] add %o1, 32, %o1 ! advance DST by 32 ldxa [%o0]ASI_USER, %o3 add %o0, 8, %o0 ! advance SRC by 8 stx %o3, [%o1 - 16] ldxa [%o0]ASI_USER, %o3 add %o0, 8, %o0 ! advance SRC by 8 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left stx %o3, [%o1 - 8] addcc %o2, 24, %o2 ! restore count to long word offset ble,pt %ncc, .ci_med_lextra ! check for more long words to move nop .ci_med_lword: ldxa [%o0]ASI_USER, %o3 ! read long word subcc %o2, 8, %o2 ! reduce count by 8 stx %o3, [%o1] ! write long word add %o0, 8, %o0 ! advance SRC by 8 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left add %o1, 8, %o1 ! advance DST by 8 .ci_med_lextra: addcc %o2, 7, %o2 ! restore rest of count bz,pt %ncc, .ci_sm_exit ! if zero, then done deccc %o2 bz,pt %ncc, .ci_sm_byte nop ba,pt %ncc, .ci_sm_half nop .align 16 nop ! instruction alignment ! see discussion at start of file .ci_med_word: btst 3, %o0 ! check for bz,pt %ncc, .ci_med_word1 ! word alignment nop .ci_med_word0: lduba [%o0]ASI_USER, %o3 ! load one byte inc %o0 stb %o3,[%o1] ! store byte inc %o1 btst 3, %o0 bnz,pt %ncc, .ci_med_word0 dec %o2 ! ! Now word aligned and have at least 36 bytes to move ! .ci_med_word1: sub %o2, 15, %o2 ! adjust count to allow cc zero test .ci_med_wmove: lduwa [%o0]ASI_USER, %o3 ! read word subcc %o2, 16, %o2 ! reduce count by 16 stw %o3, [%o1] ! write word add %o0, 4, %o0 ! advance SRC by 4 lduwa [%o0]ASI_USER, %o3 ! repeat for a total for 4 words add %o0, 4, %o0 ! advance SRC by 4 stw %o3, [%o1 + 4] add %o1, 16, %o1 ! advance DST by 16 lduwa [%o0]ASI_USER, %o3 add %o0, 4, %o0 ! advance SRC by 4 stw %o3, [%o1 - 8] lduwa [%o0]ASI_USER, %o3 add %o0, 4, %o0 ! advance SRC by 4 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left stw %o3, [%o1 - 4] addcc %o2, 12, %o2 ! restore count to word offset ble,pt %ncc, .ci_med_wextra ! check for more words to move nop .ci_med_word2: lduwa [%o0]ASI_USER, %o3 ! read word subcc %o2, 4, %o2 ! reduce count by 4 stw %o3, [%o1] ! write word add %o0, 4, %o0 ! advance SRC by 4 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left add %o1, 4, %o1 ! advance DST by 4 .ci_med_wextra: addcc %o2, 3, %o2 ! restore rest of count bz,pt %ncc, .ci_sm_exit ! if zero, then done deccc %o2 bz,pt %ncc, .ci_sm_byte nop ba,pt %ncc, .ci_sm_half nop .align 16 nop ! instruction alignment ! 
see discussion at start of file .ci_med_half: btst 1, %o0 ! check for bz,pt %ncc, .ci_med_half1 ! half word alignment nop lduba [%o0]ASI_USER, %o3 ! load one byte inc %o0 stb %o3,[%o1] ! store byte inc %o1 dec %o2 ! ! Now half word aligned and have at least 38 bytes to move ! .ci_med_half1: sub %o2, 7, %o2 ! adjust count to allow cc zero test .ci_med_hmove: lduha [%o0]ASI_USER, %o3 ! read half word subcc %o2, 8, %o2 ! reduce count by 8 sth %o3, [%o1] ! write half word add %o0, 2, %o0 ! advance SRC by 2 lduha [%o0]ASI_USER, %o3 ! repeat for a total for 4 halfwords add %o0, 2, %o0 ! advance SRC by 2 sth %o3, [%o1 + 2] add %o1, 8, %o1 ! advance DST by 8 lduha [%o0]ASI_USER, %o3 add %o0, 2, %o0 ! advance SRC by 2 sth %o3, [%o1 - 4] lduha [%o0]ASI_USER, %o3 add %o0, 2, %o0 ! advance SRC by 2 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left sth %o3, [%o1 - 2] addcc %o2, 7, %o2 ! restore count bz,pt %ncc, .ci_sm_exit deccc %o2 bz,pt %ncc, .ci_sm_byte nop ba,pt %ncc, .ci_sm_half nop .sm_copyin_err: membar #Sync stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault mov SM_SAVE_SRC, %o0 mov SM_SAVE_DST, %o1 mov SM_SAVE_COUNT, %o2 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler tst %o3 bz,pt %ncc, 3f ! if not, return error nop ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with jmp %o5 ! original arguments nop 3: retl or %g0, -1, %o0 ! return errno value SET_SIZE(copyin) /* * The _more entry points are not intended to be used directly by * any caller from outside this file. They are provided to allow * profiling and dtrace of the portions of the copy code that uses * the floating point registers. * This entry is particularly important as DTRACE (at least as of * 4/2004) does not support leaf functions. */ ENTRY(copyin_more) .copyin_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp set .copyin_err, REAL_LOFAULT /* * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes */ .do_copyin: set copyio_fault, %l7 ! .copyio_fault is lofault val ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler membar #Sync ! sync error barrier stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault mov %i0, SAVE_SRC mov %i1, SAVE_DST mov %i2, SAVE_COUNT FP_NOMIGRATE(6, 7) rd %fprs, %o2 ! check for unused fp st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs btst FPRS_FEF, %o2 bz,a,pt %icc, .do_blockcopyin wr %g0, FPRS_FEF, %fprs BST_FPQ2Q4_TOSTACK(%o2) .do_blockcopyin: rd %gsr, %o2 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr or %l6, FPUSED_FLAG, %l6 andcc DST, VIS_BLOCKSIZE - 1, TMP mov ASI_USER, %asi bz,pt %ncc, 2f neg TMP add TMP, VIS_BLOCKSIZE, TMP ! TMP = bytes required to align DST on FP_BLOCK boundary ! Using SRC as a tmp here cmp TMP, 3 bleu,pt %ncc, 1f sub CNT,TMP,CNT ! adjust main count sub TMP, 3, TMP ! adjust for end of loop test .ci_blkalign: lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration stb SRC, [DST] subcc TMP, 4, TMP lduba [REALSRC + 1]%asi, SRC add REALSRC, 4, REALSRC stb SRC, [DST + 1] lduba [REALSRC - 2]%asi, SRC add DST, 4, DST stb SRC, [DST - 2] lduba [REALSRC - 1]%asi, SRC bgu,pt %ncc, .ci_blkalign stb SRC, [DST - 1] addcc TMP, 3, TMP ! restore count adjustment bz,pt %ncc, 2f ! no bytes left? nop 1: lduba [REALSRC]%asi, SRC inc REALSRC inc DST deccc TMP bgu %ncc, 1b stb SRC, [DST - 1] 2: andn REALSRC, 0x7, SRC alignaddr REALSRC, %g0, %g0 ! SRC - 8-byte aligned ! 
DST - 64-byte aligned prefetcha [SRC]%asi, #one_read prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read ldda [SRC]%asi, %f16 #if CHEETAH_PREFETCH > 4 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read #endif ldda [SRC + 0x08]%asi, %f18 #if CHEETAH_PREFETCH > 5 prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read #endif ldda [SRC + 0x10]%asi, %f20 #if CHEETAH_PREFETCH > 6 prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read #endif faligndata %f16, %f18, %f48 ldda [SRC + 0x18]%asi, %f22 #if CHEETAH_PREFETCH > 7 prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read #endif faligndata %f18, %f20, %f50 ldda [SRC + 0x20]%asi, %f24 faligndata %f20, %f22, %f52 ldda [SRC + 0x28]%asi, %f26 faligndata %f22, %f24, %f54 ldda [SRC + 0x30]%asi, %f28 faligndata %f24, %f26, %f56 ldda [SRC + 0x38]%asi, %f30 faligndata %f26, %f28, %f58 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 sub CNT, VIS_BLOCKSIZE, CNT add SRC, VIS_BLOCKSIZE, SRC add REALSRC, VIS_BLOCKSIZE, REALSRC ba,a,pt %ncc, 1f nop .align 16 1: ldda [SRC + 0x08]%asi, %f18 faligndata %f28, %f30, %f60 ldda [SRC + 0x10]%asi, %f20 faligndata %f30, %f16, %f62 stda %f48, [DST]ASI_BLK_P ldda [SRC + 0x18]%asi, %f22 faligndata %f16, %f18, %f48 ldda [SRC + 0x20]%asi, %f24 faligndata %f18, %f20, %f50 ldda [SRC + 0x28]%asi, %f26 faligndata %f20, %f22, %f52 ldda [SRC + 0x30]%asi, %f28 faligndata %f22, %f24, %f54 ldda [SRC + 0x38]%asi, %f30 faligndata %f24, %f26, %f56 sub CNT, VIS_BLOCKSIZE, CNT ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 faligndata %f26, %f28, %f58 prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read add DST, VIS_BLOCKSIZE, DST prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read add REALSRC, VIS_BLOCKSIZE, REALSRC cmp CNT, VIS_BLOCKSIZE + 8 bgu,pt %ncc, 1b add SRC, VIS_BLOCKSIZE, SRC ! only if REALSRC & 0x7 is 0 cmp CNT, VIS_BLOCKSIZE bne %ncc, 3f andcc REALSRC, 0x7, %g0 bz,pt %ncc, 2f nop 3: faligndata %f28, %f30, %f60 faligndata %f30, %f16, %f62 stda %f48, [DST]ASI_BLK_P add DST, VIS_BLOCKSIZE, DST ba,pt %ncc, 3f nop 2: ldda [SRC + 0x08]%asi, %f18 fsrc1 %f28, %f60 ldda [SRC + 0x10]%asi, %f20 fsrc1 %f30, %f62 stda %f48, [DST]ASI_BLK_P ldda [SRC + 0x18]%asi, %f22 fsrc1 %f16, %f48 ldda [SRC + 0x20]%asi, %f24 fsrc1 %f18, %f50 ldda [SRC + 0x28]%asi, %f26 fsrc1 %f20, %f52 ldda [SRC + 0x30]%asi, %f28 fsrc1 %f22, %f54 ldda [SRC + 0x38]%asi, %f30 fsrc1 %f24, %f56 sub CNT, VIS_BLOCKSIZE, CNT add DST, VIS_BLOCKSIZE, DST add SRC, VIS_BLOCKSIZE, SRC add REALSRC, VIS_BLOCKSIZE, REALSRC fsrc1 %f26, %f58 fsrc1 %f28, %f60 fsrc1 %f30, %f62 stda %f48, [DST]ASI_BLK_P add DST, VIS_BLOCKSIZE, DST ba,a,pt %ncc, 4f nop 3: tst CNT bz,a %ncc, 4f nop 5: lduba [REALSRC]ASI_USER, TMP inc REALSRC inc DST deccc CNT bgu %ncc, 5b stb TMP, [DST - 1] 4: .copyin_exit: membar #Sync FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8) FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9) FPRAS_CHECK(FPRAS_COPYIN, %l5, 9) ! lose outputs ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr wr %o2, 0, %gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz,pt %icc, 4f nop BLD_FPQ2Q4_FROMSTACK(%o2) ba,pt %ncc, 1f wr %o3, 0, %fprs ! restore fprs 4: FZEROQ2Q4 wr %o3, 0, %fprs ! restore fprs 1: membar #Sync ! sync error barrier andn %l6, FPUSED_FLAG, %l6 stn %l6, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault FP_ALLOWMIGRATE(5, 6) ret restore %g0, 0, %o0 /* * We got here because of a fault during copyin * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). */ .copyin_err: ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler tst %o4 bz,pt %ncc, 2f ! if not, return error nop ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with jmp %g2 ! original arguments restore %g0, 0, %g0 ! dispose of copy window 2: ret restore %g0, -1, %o0 ! return error value SET_SIZE(copyin_more) ENTRY(xcopyin) cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case bleu,pt %ncc, .xcopyin_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .xcopyin_8 ! check for longword alignment nop btst 1, %o3 ! bz,pt %ncc, .xcopyin_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyin_small ! go to small copy nop ba,pt %ncc, .xcopyin_more ! otherwise go to large copy nop .xcopyin_2: btst 3, %o3 ! bz,pt %ncc, .xcopyin_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyin_small ! go to small copy nop ba,pt %ncc, .xcopyin_more ! otherwise go to large copy nop .xcopyin_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyin_small ! go to small copy nop ba,pt %ncc, .xcopyin_more ! otherwise go to large copy nop .xcopyin_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .xcopyin_small ! go to small copy nop ba,pt %ncc, .xcopyin_more ! otherwise go to large copy nop .xcopyin_small: sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value or %o5, %lo(.sm_xcopyin_err), %o5 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul membar #Sync ! sync error barrier ba,pt %ncc, .sm_do_copyin ! common code stn %o5, [THREAD_REG + T_LOFAULT] .xcopyin_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value ba,pt %ncc, .do_copyin or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT /* * We got here because of fault during xcopyin * Errno value is in ERRNO */ .xcopyin_err: ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler tst %o4 bz,pt %ncc, 2f ! if not, return error nop ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with jmp %g2 ! original arguments restore %g0, 0, %g0 ! dispose of copy window 2: ret restore ERRNO, 0, %o0 ! return errno value .sm_xcopyin_err: membar #Sync stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault mov SM_SAVE_SRC, %o0 mov SM_SAVE_DST, %o1 mov SM_SAVE_COUNT, %o2 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler tst %o3 bz,pt %ncc, 3f ! if not, return error nop ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with jmp %o5 ! original arguments nop 3: retl or %g1, 0, %o0 ! return errno value SET_SIZE(xcopyin) ENTRY(xcopyin_little) sethi %hi(.xcopyio_err), %o5 or %o5, %lo(.xcopyio_err), %o5 ldn [THREAD_REG + T_LOFAULT], %o4 membar #Sync ! 
sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] mov %o4, %o5 subcc %g0, %o2, %o3 add %o0, %o2, %o0 bz,pn %ncc, 2f ! check for zero bytes sub %o2, 1, %o4 add %o0, %o4, %o0 ! start w/last byte add %o1, %o2, %o1 lduba [%o0 + %o3]ASI_AIUSL, %o4 1: stb %o4, [%o1 + %o3] inccc %o3 sub %o0, 2, %o0 ! get next byte bcc,a,pt %ncc, 1b lduba [%o0 + %o3]ASI_AIUSL, %o4 2: membar #Sync ! sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g0, %o0 ! return (0) .xcopyio_err: membar #Sync ! sync error barrier stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl mov %g1, %o0 SET_SIZE(xcopyin_little) /* * Copy a block of storage - must not overlap (from + len <= to). * No fault handler installed (to be called under on_fault()) */ ENTRY(copyin_noerr) cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case bleu,pt %ncc, .copyin_ne_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .copyin_ne_8 ! check for longword alignment nop btst 1, %o3 ! bz,pt %ncc, .copyin_ne_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_ne_small ! go to small copy nop ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy nop .copyin_ne_2: btst 3, %o3 ! bz,pt %ncc, .copyin_ne_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_ne_small ! go to small copy nop ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy nop .copyin_ne_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_ne_small ! go to small copy nop ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy nop .copyin_ne_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyin_ne_small ! go to small copy nop ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy nop .copyin_ne_small: ldn [THREAD_REG + T_LOFAULT], %o4 tst %o4 bz,pn %ncc, .sm_do_copyin nop sethi %hi(.sm_copyio_noerr), %o5 or %o5, %lo(.sm_copyio_noerr), %o5 membar #Sync ! sync error barrier ba,pt %ncc, .sm_do_copyin stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault .copyin_noerr_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp sethi %hi(.copyio_noerr), REAL_LOFAULT ba,pt %ncc, .do_copyin or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT .copyio_noerr: jmp %l6 restore %g0,0,%g0 .sm_copyio_noerr: membar #Sync stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault jmp %o4 nop SET_SIZE(copyin_noerr) /* * Copy a block of storage - must not overlap (from + len <= to). * No fault handler installed (to be called under on_fault()) */ ENTRY(copyout_noerr) cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case bleu,pt %ncc, .copyout_ne_small ! go to larger cases xor %o0, %o1, %o3 ! are src, dst alignable? btst 7, %o3 ! bz,pt %ncc, .copyout_ne_8 ! check for longword alignment nop btst 1, %o3 ! bz,pt %ncc, .copyout_ne_2 ! check for half-word nop sethi %hi(hw_copy_limit_1), %o3 ! 
Check copy limit ld [%o3 + %lo(hw_copy_limit_1)], %o3 tst %o3 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyout_ne_small ! go to small copy nop ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy nop .copyout_ne_2: btst 3, %o3 ! bz,pt %ncc, .copyout_ne_4 ! check for word alignment nop sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyout_ne_small ! go to small copy nop ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy nop .copyout_ne_4: ! already checked longword, must be word aligned sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_4)], %o3 tst %o3 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyout_ne_small ! go to small copy nop ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy nop .copyout_ne_8: sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit ld [%o3 + %lo(hw_copy_limit_8)], %o3 tst %o3 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy cmp %o2, %o3 ! if length <= limit bleu,pt %ncc, .copyout_ne_small ! go to small copy nop ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy nop .copyout_ne_small: ldn [THREAD_REG + T_LOFAULT], %o4 tst %o4 bz,pn %ncc, .sm_do_copyout nop sethi %hi(.sm_copyio_noerr), %o5 or %o5, %lo(.sm_copyio_noerr), %o5 membar #Sync ! sync error barrier ba,pt %ncc, .sm_do_copyout stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault .copyout_noerr_more: save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp sethi %hi(.copyio_noerr), REAL_LOFAULT ba,pt %ncc, .do_copyout or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT SET_SIZE(copyout_noerr) /* * hwblkclr - clears block-aligned, block-multiple-sized regions that are * longer than 256 bytes in length using spitfire's block stores. If * the criteria for using this routine are not met then it calls bzero * and returns 1. Otherwise 0 is returned indicating success. * Caller is responsible for ensuring use_hw_bzero is true and that * kpreempt_disable() has been called. */ ! %i0 - start address ! %i1 - length of region (multiple of 64) ! %l0 - saved fprs ! %l1 - pointer to saved %d0 block ! %l2 - saved curthread->t_lwp ENTRY(hwblkclr) ! get another window w/space for one aligned block of saved fpregs save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp ! Must be block-aligned andcc %i0, (VIS_BLOCKSIZE-1), %g0 bnz,pn %ncc, 1f nop ! ... and must be 256 bytes or more cmp %i1, 256 blu,pn %ncc, 1f nop ! ... and length must be a multiple of VIS_BLOCKSIZE andcc %i1, (VIS_BLOCKSIZE-1), %g0 bz,pn %ncc, 2f nop 1: ! punt, call bzero but notify the caller that bzero was used mov %i0, %o0 call bzero mov %i1, %o1 ret restore %g0, 1, %o0 ! return (1) - did not use block operations 2: rd %fprs, %l0 ! check for unused fp btst FPRS_FEF, %l0 bz,pt %icc, 1f nop ! save in-use fpregs on stack membar #Sync add %fp, STACK_BIAS - 65, %l1 and %l1, -VIS_BLOCKSIZE, %l1 stda %d0, [%l1]ASI_BLK_P 1: membar #StoreStore|#StoreLoad|#LoadStore wr %g0, FPRS_FEF, %fprs wr %g0, ASI_BLK_P, %asi ! Clear block fzero %d0 fzero %d2 fzero %d4 fzero %d6 fzero %d8 fzero %d10 fzero %d12 fzero %d14 mov 256, %i3 ba,pt %ncc, .pz_doblock nop .pz_blkstart: ! stda %d0, [%i0 + 192]%asi ! 
in dly slot of branch that got us here stda %d0, [%i0 + 128]%asi stda %d0, [%i0 + 64]%asi stda %d0, [%i0]%asi .pz_zinst: add %i0, %i3, %i0 sub %i1, %i3, %i1 .pz_doblock: cmp %i1, 256 bgeu,a %ncc, .pz_blkstart stda %d0, [%i0 + 192]%asi cmp %i1, 64 blu %ncc, .pz_finish andn %i1, (64-1), %i3 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words set .pz_zinst, %i4 sub %i4, %i2, %i4 jmp %i4 nop .pz_finish: membar #Sync btst FPRS_FEF, %l0 bz,a .pz_finished wr %l0, 0, %fprs ! restore fprs ! restore fpregs from stack ldda [%l1]ASI_BLK_P, %d0 membar #Sync wr %l0, 0, %fprs ! restore fprs .pz_finished: ret restore %g0, 0, %o0 ! return (bzero or not) SET_SIZE(hwblkclr) /* * Copy 32 bytes of data from src (%o0) to dst (%o1) * using physical addresses. */ ENTRY_NP(hw_pa_bcopy32) rdpr %pstate, %g1 andn %g1, PSTATE_IE, %g2 wrpr %g0, %g2, %pstate rdpr %pstate, %g0 ldxa [%o0]ASI_MEM, %o2 add %o0, 8, %o0 ldxa [%o0]ASI_MEM, %o3 add %o0, 8, %o0 ldxa [%o0]ASI_MEM, %o4 add %o0, 8, %o0 ldxa [%o0]ASI_MEM, %o5 stxa %g0, [%o1]ASI_DC_INVAL membar #Sync stxa %o2, [%o1]ASI_MEM add %o1, 8, %o1 stxa %o3, [%o1]ASI_MEM add %o1, 8, %o1 stxa %o4, [%o1]ASI_MEM add %o1, 8, %o1 stxa %o5, [%o1]ASI_MEM retl wrpr %g0, %g1, %pstate SET_SIZE(hw_pa_bcopy32) DGDEF(use_hw_bcopy) .word 1 DGDEF(use_hw_bzero) .word 1 DGDEF(hw_copy_limit_1) .word 0 DGDEF(hw_copy_limit_2) .word 0 DGDEF(hw_copy_limit_4) .word 0 DGDEF(hw_copy_limit_8) .word 0 .align 64 .section ".text"
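
/*
 * The C-style pseudo-code below is illustrative only and is not assembled;
 * it restates the dispatch logic shared by copyout, xcopyout, copyin,
 * xcopyin and the _noerr variants above, and the conventions of their
 * fault paths. The helper name fpblk_copy_wanted() and the C types are
 * hypothetical; VIS_COPY_THRESHOLD and the hw_copy_limit_* tunables are
 * the ones used and defined in this file.
 *
 *	extern uint_t hw_copy_limit_1, hw_copy_limit_2;
 *	extern uint_t hw_copy_limit_4, hw_copy_limit_8;
 *
 *	static int
 *	fpblk_copy_wanted(uintptr_t src, uintptr_t dst, size_t len)
 *	{
 *		uintptr_t diff = src ^ dst;	! relative alignment of src, dst
 *		uint_t limit;
 *
 *		if (len <= VIS_COPY_THRESHOLD)
 *			return (0);		! short copies use the leaf loops
 *		if ((diff & 7) == 0)
 *			limit = hw_copy_limit_8;	! longword alignable
 *		else if (diff & 1)
 *			limit = hw_copy_limit_1;	! byte copies only
 *		else if (diff & 3)
 *			limit = hw_copy_limit_2;	! halfword alignable
 *		else
 *			limit = hw_copy_limit_4;	! word alignable
 *		if (limit == 0)
 *			return (0);		! zero limit disables HW copy
 *		return (len > limit);		! FPBLK only above the limit
 *	}
 *
 * On a fault, the handlers above restore the original source, destination
 * and count arguments and, if curthread->t_copyops is set, vector to the
 * matching member (CP_COPYOUT, CP_XCOPYOUT, CP_COPYIN, CP_XCOPYIN) with
 * those arguments; otherwise copyin/copyout return -1 (per DDI/DKI) while
 * xcopyin/xcopyout return the errno captured in %g1/ERRNO.
 */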