|
Hi all,
I took some suggestions from Goran Bilski and the
trials that Falk had been doing trying to speed up the Ethernet, and built
assembler memcpy and memmove functions that incorporate some loop unrolling,
etc. to try and get uniformly faster moves and copies.
General code flow for both ascending and descending
transfers is:
1. Transfer 1 byte per loop to get to an
aligned destination address.
2. Transfer 8-word blocks per loop if at
least 8 words are left to transfer, using unrolled loops per Goran's
suggestions.
3. Transfer 1 word per loop if at least 1 word is left
to transfer.
4. Transfer 1 byte per loop to finish.
I kept the ascending move (slightly slower than the
descending move) for the memcpy function, just in case anyone is dependent on
this behaviour. If anyone with more experience thinks (knows?)
that the direction of memcpy's operation is irrelevant to every program
ever written <G>, then I can do a quick change and get slightly better
performance on memcpy moves.
Attached is the fastcopy.S file that I place and build in
/arch/microblaze/lib. I just commented out the C versions of memcpy and
memmove in their respective .c files in the same directory.
Obviously this change won't make a night-and-day difference on small moves,
but it might be noticeable on larger blocks of data.
Jim Law
Iris Power LP
Please note that Iris Power LP Canada has relocated
its Toronto operation to 3110 American Drive, Mississauga L4V 1T2, right next to
the Toronto airport. Our phone and fax numbers remain the same, as does the
level of service we offer our clients. For further details please visit
our website at www.irispower.com
|
###################################-*-asm*-
#
# Copyright 2008 (c) Jim Law - Iris LP  All rights reserved.
#
# This file is subject to the terms and conditions of the GNU General
# Public License.  See the file COPYING in the main directory of this
# archive for more details.
#
# Written by Jim Law <jlaw@xxxxxxxxxxxxx>
#
# intended to replace:
#	memcpy in memcpy.c and
#	memmove in memmove.c
# ... in arch/microblaze/lib
#
#
# assly_fastcopy.S
#
# Attempt at quicker memcpy and memmove for MicroBlaze
#	Input :	Operand1 in Reg r5 - destination address
#		Operand2 in Reg r6 - source address
#		Operand3 in Reg r7 - number of bytes to transfer
#	Output:	Result in Reg r3 - starting destination address
#
#
# Explanation:
#	Perform (possibly unaligned) copy of a block of memory
#	between mem locations with size of xfer specified in bytes
#
# Register usage (both routines):
#	r3	return value, the original destination address
#	r4	n  - byte count of the phase currently running
#	r5	d  - destination pointer
#	r6	s  - source pointer
#	r7	c  - bytes still to be transferred after the current phase
#	r8	as - source pointer rounded down to a word boundary
#	r9-r12	temporaries (t1..t4, h = carried partial word, v = loaded word)
#	r10	also the running byte offset of the ascending word phase
#	r15	return address, used by rtsd r15,8
#
# Phases (ascending and descending):
#	- byte loop until the destination is word aligned
#	- unrolled 32-byte blocks while at least 32 bytes remain
#	- single words while at least 4 bytes remain
#	- byte loop for the remainder
# Copies shorter than 4 bytes go straight to the final byte loop.
#
# NOTE(review): when the source is not word aligned, the block and word
# phases load aligned words and merge them with shifts so that bytes from
# the lower address end up in the more significant bits of the stored
# word.  That looks like a big-endian assumption - confirm for the target.
#
#######################################

#include <asm/clinkage.h>

	.text

	.globl	C_SYMBOL_NAME(memcpy)
	.ent	C_SYMBOL_NAME(memcpy)

C_SYMBOL_NAME(memcpy):
fast_memcpy_ascending:
	addi	r3,r5,0			# move d to return register as value of function

	addi	r4,r0,4			# n = 4
	cmpu	r4,r4,r7		# n = c - n  (unsigned)
	blti	r4,a_xfer_end		# if n < 0, less than one word to transfer

	# transfer first 0~3 bytes to get aligned dest address
	andi	r4,r5,3			# n = d & 3
	beqi	r4,a_dalign_done	# if zero, destination already aligned
	rsubi	r4,r4,4			# n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset)
	rsub	r7,r4,r7		# c = c - n adjust c

a_xfer_first_loop:
	beqi	r4,a_dalign_done	# if no bytes left to transfer, transfer the bulk
	lbui	r11,r6,0		# h = *s
	sbi	r11,r5,0		# *d = h
	addi	r6,r6,1			# s++
	addi	r5,r5,1			# d++
	brid	a_xfer_first_loop	# loop
	addi	r4,r4,-1		# n-- (IN DELAY SLOT)

a_dalign_done:
	addi	r4,r0,32		# n = 32
	cmpu	r4,r4,r7		# n = c - n  (unsigned)
	blti	r4,a_block_done		# if n < 0, less than one block to transfer

a_block_xfer:
	andi	r4,r7,0xffffffe0	# n = c & ~31
	rsub	r7,r4,r7		# c = c - n

	andi	r9,r6,3			# t1 = s & 3
	bnei	r9,a_block_unaligned	# if temp != 0, unaligned transfers needed

a_block_aligned:
	lwi	r9,r6,0			# t1 = *(s + 0)
	lwi	r10,r6,4		# t2 = *(s + 4)
	lwi	r11,r6,8		# t3 = *(s + 8)
	lwi	r12,r6,12		# t4 = *(s + 12)
	swi	r9,r5,0			# *(d + 0) = t1
	swi	r10,r5,4		# *(d + 4) = t2
	swi	r11,r5,8		# *(d + 8) = t3
	swi	r12,r5,12		# *(d + 12) = t4
	lwi	r9,r6,16		# t1 = *(s + 16)
	lwi	r10,r6,20		# t2 = *(s + 20)
	lwi	r11,r6,24		# t3 = *(s + 24)
	lwi	r12,r6,28		# t4 = *(s + 28)
	swi	r9,r5,16		# *(d + 16) = t1
	swi	r10,r5,20		# *(d + 20) = t2
	swi	r11,r5,24		# *(d + 24) = t3
	swi	r12,r5,28		# *(d + 28) = t4
	addi	r6,r6,32		# s = s + 32
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,a_block_aligned	# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)

	bri	a_block_done

a_block_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	add	r6,r6,r4		# s = s + n
	lwi	r11,r8,0		# h = *(as + 0)

	addi	r9,r9,-1
	beqi	r9,a_block_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,a_block_u2		# t1 was 2 => 2 byte offset

a_block_u3:
	bslli	r11,r11,24		# h = h << 24
a_bu3_loop:
	lwi	r12,r8,4		# v = *(as + 4)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,0			# *(d + 0) = t1
	bslli	r11,r12,24		# h = v << 24
	lwi	r12,r8,8		# v = *(as + 8)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,4			# *(d + 4) = t1
	bslli	r11,r12,24		# h = v << 24
	lwi	r12,r8,12		# v = *(as + 12)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,8			# *(d + 8) = t1
	bslli	r11,r12,24		# h = v << 24
	lwi	r12,r8,16		# v = *(as + 16)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,12		# *(d + 12) = t1
	bslli	r11,r12,24		# h = v << 24
	lwi	r12,r8,20		# v = *(as + 20)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,16		# *(d + 16) = t1
	bslli	r11,r12,24		# h = v << 24
	lwi	r12,r8,24		# v = *(as + 24)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,20		# *(d + 20) = t1
	bslli	r11,r12,24		# h = v << 24
	lwi	r12,r8,28		# v = *(as + 28)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,24		# *(d + 24) = t1
	bslli	r11,r12,24		# h = v << 24
	lwi	r12,r8,32		# v = *(as + 32)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,28		# *(d + 28) = t1
	bslli	r11,r12,24		# h = v << 24
	addi	r8,r8,32		# as = as + 32
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,a_bu3_loop		# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)

	bri	a_block_done

a_block_u1:
	bslli	r11,r11,8		# h = h << 8
a_bu1_loop:
	lwi	r12,r8,4		# v = *(as + 4)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,0			# *(d + 0) = t1
	bslli	r11,r12,8		# h = v << 8
	lwi	r12,r8,8		# v = *(as + 8)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,4			# *(d + 4) = t1
	bslli	r11,r12,8		# h = v << 8
	lwi	r12,r8,12		# v = *(as + 12)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,8			# *(d + 8) = t1
	bslli	r11,r12,8		# h = v << 8
	lwi	r12,r8,16		# v = *(as + 16)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,12		# *(d + 12) = t1
	bslli	r11,r12,8		# h = v << 8
	lwi	r12,r8,20		# v = *(as + 20)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,16		# *(d + 16) = t1
	bslli	r11,r12,8		# h = v << 8
	lwi	r12,r8,24		# v = *(as + 24)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,20		# *(d + 20) = t1
	bslli	r11,r12,8		# h = v << 8
	lwi	r12,r8,28		# v = *(as + 28)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,24		# *(d + 24) = t1
	bslli	r11,r12,8		# h = v << 8
	lwi	r12,r8,32		# v = *(as + 32)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,28		# *(d + 28) = t1
	bslli	r11,r12,8		# h = v << 8
	addi	r8,r8,32		# as = as + 32
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,a_bu1_loop		# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)

	bri	a_block_done

a_block_u2:
	bslli	r11,r11,16		# h = h << 16
a_bu2_loop:
	lwi	r12,r8,4		# v = *(as + 4)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,0			# *(d + 0) = t1
	bslli	r11,r12,16		# h = v << 16
	lwi	r12,r8,8		# v = *(as + 8)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,4			# *(d + 4) = t1
	bslli	r11,r12,16		# h = v << 16
	lwi	r12,r8,12		# v = *(as + 12)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,8			# *(d + 8) = t1
	bslli	r11,r12,16		# h = v << 16
	lwi	r12,r8,16		# v = *(as + 16)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,12		# *(d + 12) = t1
	bslli	r11,r12,16		# h = v << 16
	lwi	r12,r8,20		# v = *(as + 20)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,16		# *(d + 16) = t1
	bslli	r11,r12,16		# h = v << 16
	lwi	r12,r8,24		# v = *(as + 24)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,20		# *(d + 20) = t1
	bslli	r11,r12,16		# h = v << 16
	lwi	r12,r8,28		# v = *(as + 28)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,24		# *(d + 24) = t1
	bslli	r11,r12,16		# h = v << 16
	lwi	r12,r8,32		# v = *(as + 32)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,28		# *(d + 28) = t1
	bslli	r11,r12,16		# h = v << 16
	addi	r8,r8,32		# as = as + 32
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,a_bu2_loop		# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)

a_block_done:
	addi	r4,r0,4			# n = 4
	cmpu	r4,r4,r7		# n = c - n  (unsigned)
	blti	r4,a_xfer_end		# if n < 0, less than one word to transfer

a_word_xfer:
	andi	r4,r7,0xfffffffc	# n = c & ~3
	addi	r10,r0,0		# offset = 0

	andi	r9,r6,3			# t1 = s & 3
	bnei	r9,a_word_unaligned	# if temp != 0, unaligned transfers needed

a_word_aligned:
	lw	r9,r6,r10		# t1 = *(s + offset)
	sw	r9,r5,r10		# *(d + offset) = t1
	addi	r4,r4,-4		# n = n - 4
	bneid	r4,a_word_aligned	# loop
	addi	r10,r10,4		# offset = offset + 4 (IN DELAY SLOT)

	bri	a_word_done

a_word_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	lwi	r11,r8,0		# h = *(as + 0)
	addi	r8,r8,4			# as = as + 4

	addi	r9,r9,-1
	beqi	r9,a_word_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,a_word_u2		# t1 was 2 => 2 byte offset

a_word_u3:
	bslli	r11,r11,24		# h = h << 24
a_wu3_loop:
	lw	r12,r8,r10		# v = *(as + offset)
	bsrli	r9,r12,8		# t1 = v >> 8
	or	r9,r11,r9		# t1 = h | t1
	sw	r9,r5,r10		# *(d + offset) = t1
	bslli	r11,r12,24		# h = v << 24
	addi	r4,r4,-4		# n = n - 4
	bneid	r4,a_wu3_loop		# while (n) loop
	addi	r10,r10,4		# offset = offset + 4 (IN DELAY SLOT)

	bri	a_word_done

a_word_u1:
	bslli	r11,r11,8		# h = h << 8
a_wu1_loop:
	lw	r12,r8,r10		# v = *(as + offset)
	bsrli	r9,r12,24		# t1 = v >> 24
	or	r9,r11,r9		# t1 = h | t1
	sw	r9,r5,r10		# *(d + offset) = t1
	bslli	r11,r12,8		# h = v << 8
	addi	r4,r4,-4		# n = n - 4
	bneid	r4,a_wu1_loop		# while (n) loop
	addi	r10,r10,4		# offset = offset + 4 (IN DELAY SLOT)

	bri	a_word_done

a_word_u2:
	bslli	r11,r11,16		# h = h << 16
a_wu2_loop:
	lw	r12,r8,r10		# v = *(as + offset)
	bsrli	r9,r12,16		# t1 = v >> 16
	or	r9,r11,r9		# t1 = h | t1
	sw	r9,r5,r10		# *(d + offset) = t1
	bslli	r11,r12,16		# h = v << 16
	addi	r4,r4,-4		# n = n - 4
	bneid	r4,a_wu2_loop		# while (n) loop
	addi	r10,r10,4		# offset = offset + 4 (IN DELAY SLOT)

a_word_done:
	add	r5,r5,r10		# d = d + offset
	add	r6,r6,r10		# s = s + offset
	rsub	r7,r10,r7		# c = c - offset

a_xfer_end:
a_xfer_end_loop:
	beqi	r7,a_done		# while (c)
	lbui	r9,r6,0			# t1 = *s
	addi	r6,r6,1			# s++
	sbi	r9,r5,0			# *d = t1
	addi	r7,r7,-1		# c--
	brid	a_xfer_end_loop		# loop
	addi	r5,r5,1			# d++ (IN DELAY SLOT)

a_done:
	rtsd	r15,8
	nop

	.end	C_SYMBOL_NAME(memcpy)

###############################################################
#
# memmove: same register interface as memcpy above.
# When d is above s (unsigned compare) the copy runs from the end of both
# buffers downwards; otherwise it branches to the ascending code of memcpy.
#
###############################################################

	.globl	C_SYMBOL_NAME(memmove)
	.ent	C_SYMBOL_NAME(memmove)

C_SYMBOL_NAME(memmove):
	cmpu	r4,r5,r6		# n = s - d
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	addi	r3,r5,0			# move d to return register as value of function

	add	r5,r5,r7		# d = d + c
	add	r6,r6,r7		# s = s + c

	addi	r4,r0,4			# n = 4
	cmpu	r4,r4,r7		# n = c - n  (unsigned)
	blti	r4,d_xfer_end		# if n < 0, less than one word to transfer

	# transfer first 0~3 bytes to get aligned dest address
	andi	r4,r5,3			# n = d & 3
	beqi	r4,d_dalign_done	# if zero, destination already aligned
	rsub	r7,r4,r7		# c = c - n adjust c

d_xfer_first_loop:
	beqi	r4,d_dalign_done	# if no bytes left to transfer, transfer the bulk
	addi	r6,r6,-1		# s--
	addi	r5,r5,-1		# d--
	lbui	r11,r6,0		# h = *s
	sbi	r11,r5,0		# *d = h
	brid	d_xfer_first_loop	# loop
	addi	r4,r4,-1		# n-- (IN DELAY SLOT)

d_dalign_done:
	addi	r4,r0,32		# n = 32
	cmpu	r4,r4,r7		# n = c - n  (unsigned)
	blti	r4,d_block_done		# if n < 0, less than one block to transfer

d_block_xfer:
	andi	r4,r7,0xffffffe0	# n = c & ~31
	rsub	r7,r4,r7		# c = c - n

	andi	r9,r6,3			# t1 = s & 3
	bnei	r9,d_block_unaligned	# if temp != 0, unaligned transfers needed

d_block_aligned:
	addi	r6,r6,-32		# s = s - 32
	addi	r5,r5,-32		# d = d - 32
	lwi	r9,r6,28		# t1 = *(s + 28)
	lwi	r10,r6,24		# t2 = *(s + 24)
	lwi	r11,r6,20		# t3 = *(s + 20)
	lwi	r12,r6,16		# t4 = *(s + 16)
	swi	r9,r5,28		# *(d + 28) = t1
	swi	r10,r5,24		# *(d + 24) = t2
	swi	r11,r5,20		# *(d + 20) = t3
	swi	r12,r5,16		# *(d + 16) = t4
	lwi	r9,r6,12		# t1 = *(s + 12)
	lwi	r10,r6,8		# t2 = *(s + 8)
	lwi	r11,r6,4		# t3 = *(s + 4)
	lwi	r12,r6,0		# t4 = *(s + 0)
	swi	r9,r5,12		# *(d + 12) = t1
	swi	r10,r5,8		# *(d + 8) = t2
	swi	r11,r5,4		# *(d + 4) = t3
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,d_block_aligned	# while (n) loop
	swi	r12,r5,0		# *(d + 0) = t4 (IN DELAY SLOT)

	bri	d_block_done

d_block_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	rsub	r6,r4,r6		# s = s - n
	lwi	r11,r8,0		# h = *(as + 0)

	addi	r9,r9,-1
	beqi	r9,d_block_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,d_block_u2		# t1 was 2 => 2 byte offset

d_block_u3:
	bsrli	r11,r11,8		# h = h >> 8
d_bu3_loop:
	addi	r8,r8,-32		# as = as - 32
	addi	r5,r5,-32		# d = d - 32
	lwi	r12,r8,28		# v = *(as + 28)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,28		# *(d + 28) = t1
	bsrli	r11,r12,8		# h = v >> 8
	lwi	r12,r8,24		# v = *(as + 24)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,24		# *(d + 24) = t1
	bsrli	r11,r12,8		# h = v >> 8
	lwi	r12,r8,20		# v = *(as + 20)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,20		# *(d + 20) = t1
	bsrli	r11,r12,8		# h = v >> 8
	lwi	r12,r8,16		# v = *(as + 16)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,16		# *(d + 16) = t1
	bsrli	r11,r12,8		# h = v >> 8
	lwi	r12,r8,12		# v = *(as + 12)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,12		# *(d + 12) = t1
	bsrli	r11,r12,8		# h = v >> 8
	lwi	r12,r8,8		# v = *(as + 8)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,8			# *(d + 8) = t1
	bsrli	r11,r12,8		# h = v >> 8
	lwi	r12,r8,4		# v = *(as + 4)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,4			# *(d + 4) = t1
	bsrli	r11,r12,8		# h = v >> 8
	lwi	r12,r8,0		# v = *(as + 0)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,0			# *(d + 0) = t1
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,d_bu3_loop		# while (n) loop
	bsrli	r11,r12,8		# h = v >> 8 (IN DELAY SLOT)

	bri	d_block_done

d_block_u1:
	bsrli	r11,r11,24		# h = h >> 24
d_bu1_loop:
	addi	r8,r8,-32		# as = as - 32
	addi	r5,r5,-32		# d = d - 32
	lwi	r12,r8,28		# v = *(as + 28)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,28		# *(d + 28) = t1
	bsrli	r11,r12,24		# h = v >> 24
	lwi	r12,r8,24		# v = *(as + 24)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,24		# *(d + 24) = t1
	bsrli	r11,r12,24		# h = v >> 24
	lwi	r12,r8,20		# v = *(as + 20)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,20		# *(d + 20) = t1
	bsrli	r11,r12,24		# h = v >> 24
	lwi	r12,r8,16		# v = *(as + 16)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,16		# *(d + 16) = t1
	bsrli	r11,r12,24		# h = v >> 24
	lwi	r12,r8,12		# v = *(as + 12)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,12		# *(d + 12) = t1
	bsrli	r11,r12,24		# h = v >> 24
	lwi	r12,r8,8		# v = *(as + 8)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,8			# *(d + 8) = t1
	bsrli	r11,r12,24		# h = v >> 24
	lwi	r12,r8,4		# v = *(as + 4)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,4			# *(d + 4) = t1
	bsrli	r11,r12,24		# h = v >> 24
	lwi	r12,r8,0		# v = *(as + 0)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,0			# *(d + 0) = t1
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,d_bu1_loop		# while (n) loop
	bsrli	r11,r12,24		# h = v >> 24 (IN DELAY SLOT)

	bri	d_block_done

d_block_u2:
	bsrli	r11,r11,16		# h = h >> 16
d_bu2_loop:
	addi	r8,r8,-32		# as = as - 32
	addi	r5,r5,-32		# d = d - 32
	lwi	r12,r8,28		# v = *(as + 28)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,28		# *(d + 28) = t1
	bsrli	r11,r12,16		# h = v >> 16
	lwi	r12,r8,24		# v = *(as + 24)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,24		# *(d + 24) = t1
	bsrli	r11,r12,16		# h = v >> 16
	lwi	r12,r8,20		# v = *(as + 20)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,20		# *(d + 20) = t1
	bsrli	r11,r12,16		# h = v >> 16
	lwi	r12,r8,16		# v = *(as + 16)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,16		# *(d + 16) = t1
	bsrli	r11,r12,16		# h = v >> 16
	lwi	r12,r8,12		# v = *(as + 12)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,12		# *(d + 12) = t1
	bsrli	r11,r12,16		# h = v >> 16
	lwi	r12,r8,8		# v = *(as + 8)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,8			# *(d + 8) = t1
	bsrli	r11,r12,16		# h = v >> 16
	lwi	r12,r8,4		# v = *(as + 4)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,4			# *(d + 4) = t1
	bsrli	r11,r12,16		# h = v >> 16
	lwi	r12,r8,0		# v = *(as + 0)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	swi	r9,r5,0			# *(d + 0) = t1
	addi	r4,r4,-32		# n = n - 32
	bneid	r4,d_bu2_loop		# while (n) loop
	bsrli	r11,r12,16		# h = v >> 16 (IN DELAY SLOT)

d_block_done:
	addi	r4,r0,4			# n = 4
	cmpu	r4,r4,r7		# n = c - n  (unsigned)
	blti	r4,d_xfer_end		# if n < 0, less than one word to transfer

d_word_xfer:
	andi	r4,r7,0xfffffffc	# n = c & ~3
	rsub	r5,r4,r5		# d = d - n
	rsub	r6,r4,r6		# s = s - n
	rsub	r7,r4,r7		# c = c - n

	andi	r9,r6,3			# t1 = s & 3
	bnei	r9,d_word_unaligned	# if temp != 0, unaligned transfers needed

d_word_aligned:
	addi	r4,r4,-4		# n = n - 4
	lw	r9,r6,r4		# t1 = *(s + n)
	bneid	r4,d_word_aligned	# loop
	sw	r9,r5,r4		# *(d + n) = t1 (IN DELAY SLOT)

	bri	d_word_done

d_word_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	lw	r11,r8,r4		# h = *(as + n)

	addi	r9,r9,-1
	beqi	r9,d_word_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,d_word_u2		# t1 was 2 => 2 byte offset

d_word_u3:
	bsrli	r11,r11,8		# h = h >> 8
d_wu3_loop:
	addi	r4,r4,-4		# n = n - 4
	lw	r12,r8,r4		# v = *(as + n)
	bslli	r9,r12,24		# t1 = v << 24
	or	r9,r11,r9		# t1 = h | t1
	sw	r9,r5,r4		# *(d + n) = t1
	bneid	r4,d_wu3_loop		# while (n) loop
	bsrli	r11,r12,8		# h = v >> 8 (IN DELAY SLOT)

	bri	d_word_done

d_word_u1:
	bsrli	r11,r11,24		# h = h >> 24
d_wu1_loop:
	addi	r4,r4,-4		# n = n - 4
	lw	r12,r8,r4		# v = *(as + n)
	bslli	r9,r12,8		# t1 = v << 8
	or	r9,r11,r9		# t1 = h | t1
	sw	r9,r5,r4		# *(d + n) = t1
	bneid	r4,d_wu1_loop		# while (n) loop
	bsrli	r11,r12,24		# h = v >> 24 (IN DELAY SLOT)

	bri	d_word_done

d_word_u2:
	bsrli	r11,r11,16		# h = h >> 16
d_wu2_loop:
	addi	r4,r4,-4		# n = n - 4
	lw	r12,r8,r4		# v = *(as + n)
	bslli	r9,r12,16		# t1 = v << 16
	or	r9,r11,r9		# t1 = h | t1
	sw	r9,r5,r4		# *(d + n) = t1
	bneid	r4,d_wu2_loop		# while (n) loop
	bsrli	r11,r12,16		# h = v >> 16 (IN DELAY SLOT)

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	beqi	r7,d_done		# while (c); leave through the memmove epilogue
	addi	r6,r6,-1		# s--
	lbui	r9,r6,0			# t1 = *s
	addi	r5,r5,-1		# d--
	sbi	r9,r5,0			# *d = t1
	brid	d_xfer_end_loop		# loop
	addi	r7,r7,-1		# c-- (IN DELAY SLOT)

d_done:
	rtsd	r15,8
	nop

	.end	C_SYMBOL_NAME(memmove)