[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[microblaze-uclinux] MicroBlaze fast memcpy / memmove



Hi all,
 
I took some suggestions from Goran Bilski and the trials that Falk had been doing trying to speed up the Ethernet, and built assembler memcpy and memmove functions that incorporate some loop unrolling, etc. to try and get uniformly faster moves and copies.
 
General code flow for both ascending and descending transfers is:
1. Transfer 1-byte per loop to get to an aligned destination address.
2. Transfer 8-word blocks per loop if at least 8 words left to transfer, using unrolled loops per Goran's suggestions.
3. Transfer 1-word per loop if at least 1 word left to transfer.
4. Transfer 1-byte per loop to finish.
 
I kept the ascending move (slightly slower than the descending move) for the memcpy function, just in case anyone is dependent on this behaviour. If anyone with more experience thinks (knows?) that the direction of memcpy's operation is irrelevant to every program ever written <G>, then I can do a quick change and get slightly better performance on memcpy moves.
 
Attached is the fastcopy.S file that I place and build in /arch/microblaze/lib.  I just commented out the C versions of memcpy and memmove in their respective .c files in the same directory.
 
Obviously this change won't make a day and night difference on small moves, but might be noticeable on larger blocks of data.
 
Jim Law
Iris Power LP
 
 
Please note that Iris Power LP Canada has relocated its Toronto operation to 3110 American Drive, Mississauga L4V 1T2, right next to the Toronto airport. Our phone and fax numbers remain the same, as does the level of service we offer our clients.  For further details please visit our website at www.irispower.com
###################################-*-asm*- 
# 
# Copyright 2008 (c) Jim Law - Iris LP  All rights reserved. 
# 
# This file is subject to the terms and conditions of the GNU General
# Public License.  See the file COPYING in the main directory of this
# archive for more details.
#
# Written by Jim Law <jlaw@xxxxxxxxxxxxx>
#  
# intended to replace:
#	memcpy in memcpy.c and
# 	memmove in memmove.c
# ... in arch/microblaze/lib
# 
#
# assly_fastcopy.S 
# 
# Attempt at quicker memcpy and memmove for MicroBlaze
#	Input :	Operand1 in Reg r5 - destination address
#		Operand2 in Reg r6 - source address
#		Operand3 in Reg r7 - number of bytes to transfer
#	Output: Result in Reg r3 - starting destination address
#			
# 
# Explanation:
# 	Perform (possibly unaligned) copy of a block of memory
#	between mem locations with size of xfer spec'd in bytes
#	
#
#######################################

#include <asm/clinkage.h>

	.globl	C_SYMBOL_NAME(memcpy)
	.ent	C_SYMBOL_NAME(memcpy)

#ifdef __MICROBLAZEEL__
#error fastcopy.S assumes big-endian word layout. Use the C memcpy and memmove on little-endian MicroBlaze.
#endif

#######################################
# memcpy - ascending (low to high address) copy
#	Input :	r5 - destination address (d)
#		r6 - source address (s)
#		r7 - number of bytes to transfer (c)
#	Output:	r3 - starting destination address
#	Scratch: r4 (n), r8 (as), r9 - r12 (data temporaries)
#
# Phases, in order:
#	a_xfer_first_loop  - byte copies until d is word aligned
#	a_block_*          - 32 bytes per pass, unrolled
#	a_word_*           - one word per pass
#	a_xfer_end_loop    - remaining bytes
#
# Once d is aligned, s may still be 1, 2 or 3 bytes off a word boundary.
# The *_u1 / *_u2 / *_u3 paths then read aligned source words only and
# build each destination word from two neighbouring source words:
#	h  = bytes carried over from the previous source word, shifted up
#	v  = next aligned source word
#	*d = h | (v shifted down)
# The shift directions are valid for big-endian byte order only, hence
# the preprocessor guard above.  bslli/bsrli are barrel-shift
# instructions, so the core must be built with the barrel shifter.
#
# Labels here are file-global; memmove enters at fast_memcpy_ascending.
#######################################

C_SYMBOL_NAME(memcpy):
fast_memcpy_ascending:

	addi	r3,r5,0		# move d to return register as value of function

	addi	r4,r0,4		# n = 4
	cmpu	r4,r4,r7	# n = c - n  (unsigned)
	blti	r4,a_xfer_end		# if n < 0, less than one word to transfer

	# transfer first 0~3 bytes to get aligned dest address
	andi	r4,r5,3		# n = d & 3
	beqi	r4,a_dalign_done		# if zero, destination already aligned
	rsubi	r4,r4,4		# n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset)
	rsub	r7,r4,r7		# c = c - n adjust c

a_xfer_first_loop:
	beqi	r4,a_dalign_done		# if no bytes left to transfer, transfer the bulk
	lbui	r11,r6,0	# h = *s
	sbi	r11,r5,0	# *d = h
	addi	r6,r6,1		# s++
	addi	r5,r5,1		# d++
	brid	a_xfer_first_loop		# loop
	addi	r4,r4,-1	# n-- (IN DELAY SLOT)

a_dalign_done:
	addi	r4,r0,32		# n = 32
	cmpu	r4,r4,r7	# n = c - n  (unsigned)
	blti	r4,a_block_done		# if n < 0, less than one block to transfer

a_block_xfer:
	andi	r4,r7,0xffffffe0	# n = c & ~31 (bytes handled by the block loops)
	rsub	r7,r4,r7		# c = c - n

	andi	r9,r6,3		# t1 = s & 3
	bnei	r9,a_block_unaligned		# if temp != 0, unaligned transfers needed

a_block_aligned:
	lwi	r9,r6,0		# t1 = *(s + 0)
	lwi	r10,r6,4		# t2 = *(s + 4)
	lwi	r11,r6,8		# t3 = *(s + 8)
	lwi	r12,r6,12	# t4 = *(s + 12)
	swi	r9,r5,0		# *(d + 0) = t1
	swi	r10,r5,4		# *(d + 4) = t2
	swi	r11,r5,8		# *(d + 8) = t3
	swi	r12,r5,12	# *(d + 12) = t4
	lwi	r9,r6,16		# t1 = *(s + 16)
	lwi	r10,r6,20	# t2 = *(s + 20)
	lwi	r11,r6,24	# t3 = *(s + 24)
	lwi	r12,r6,28	# t4 = *(s + 28)
	swi	r9,r5,16		# *(d + 16) = t1
	swi	r10,r5,20	# *(d + 20) = t2
	swi	r11,r5,24	# *(d + 24) = t3
	swi	r12,r5,28	# *(d + 28) = t4
	addi	r6,r6,32		# s = s + 32
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,a_block_aligned	# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)
	bri	a_block_done

a_block_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	add	r6,r6,r4		# s = s + n (s is not used again until the block is done)
	lwi	r11,r8,0		# h = *(as + 0)

	addi	r9,r9,-1
	beqi	r9,a_block_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,a_block_u2		# t1 was 2 => 2 byte offset

a_block_u3:
	bslli	r11,r11,24	# h = h << 24
a_bu3_loop:
	lwi	r12,r8,4		# v = *(as + 4)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,0	# *(d + 0) = t1
	bslli	r11,r12,24	# h = v << 24
	lwi	r12,r8,8		# v = *(as + 8)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,4	# *(d + 4) = t1
	bslli	r11,r12,24	# h = v << 24
	lwi	r12,r8,12		# v = *(as + 12)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,8	# *(d + 8) = t1
	bslli	r11,r12,24	# h = v << 24
	lwi	r12,r8,16		# v = *(as + 16)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,12	# *(d + 12) = t1
	bslli	r11,r12,24	# h = v << 24
	lwi	r12,r8,20		# v = *(as + 20)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,16	# *(d + 16) = t1
	bslli	r11,r12,24	# h = v << 24
	lwi	r12,r8,24		# v = *(as + 24)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,20	# *(d + 20) = t1
	bslli	r11,r12,24	# h = v << 24
	lwi	r12,r8,28		# v = *(as + 28)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,24	# *(d + 24) = t1
	bslli	r11,r12,24	# h = v << 24
	lwi	r12,r8,32		# v = *(as + 32)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,28	# *(d + 28) = t1
	bslli	r11,r12,24	# h = v << 24
	addi	r8,r8,32		# as = as + 32
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,a_bu3_loop	# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)
	bri	a_block_done

a_block_u1:
	bslli	r11,r11,8	# h = h << 8
a_bu1_loop:
	lwi	r12,r8,4		# v = *(as + 4)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,0	# *(d + 0) = t1
	bslli	r11,r12,8	# h = v << 8
	lwi	r12,r8,8		# v = *(as + 8)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,4	# *(d + 4) = t1
	bslli	r11,r12,8	# h = v << 8
	lwi	r12,r8,12		# v = *(as + 12)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,8	# *(d + 8) = t1
	bslli	r11,r12,8	# h = v << 8
	lwi	r12,r8,16		# v = *(as + 16)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,12	# *(d + 12) = t1
	bslli	r11,r12,8	# h = v << 8
	lwi	r12,r8,20		# v = *(as + 20)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,16	# *(d + 16) = t1
	bslli	r11,r12,8	# h = v << 8
	lwi	r12,r8,24		# v = *(as + 24)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,20	# *(d + 20) = t1
	bslli	r11,r12,8	# h = v << 8
	lwi	r12,r8,28		# v = *(as + 28)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,24	# *(d + 24) = t1
	bslli	r11,r12,8	# h = v << 8
	lwi	r12,r8,32		# v = *(as + 32)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,28	# *(d + 28) = t1
	bslli	r11,r12,8	# h = v << 8
	addi	r8,r8,32		# as = as + 32
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,a_bu1_loop	# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)
	bri	a_block_done

a_block_u2:
	bslli	r11,r11,16	# h = h << 16
a_bu2_loop:
	lwi	r12,r8,4		# v = *(as + 4)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,0	# *(d + 0) = t1
	bslli	r11,r12,16	# h = v << 16
	lwi	r12,r8,8		# v = *(as + 8)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,4	# *(d + 4) = t1
	bslli	r11,r12,16	# h = v << 16
	lwi	r12,r8,12		# v = *(as + 12)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,8	# *(d + 8) = t1
	bslli	r11,r12,16	# h = v << 16
	lwi	r12,r8,16		# v = *(as + 16)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,12	# *(d + 12) = t1
	bslli	r11,r12,16	# h = v << 16
	lwi	r12,r8,20		# v = *(as + 20)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,16	# *(d + 16) = t1
	bslli	r11,r12,16	# h = v << 16
	lwi	r12,r8,24		# v = *(as + 24)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,20	# *(d + 20) = t1
	bslli	r11,r12,16	# h = v << 16
	lwi	r12,r8,28		# v = *(as + 28)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,24	# *(d + 24) = t1
	bslli	r11,r12,16	# h = v << 16
	lwi	r12,r8,32		# v = *(as + 32)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,28	# *(d + 28) = t1
	bslli	r11,r12,16	# h = v << 16
	addi	r8,r8,32		# as = as + 32
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,a_bu2_loop	# while (n) loop
	addi	r5,r5,32		# d = d + 32 (IN DELAY SLOT)
	# falls through to a_block_done

a_block_done:
	addi	r4,r0,4		# n = 4
	cmpu	r4,r4,r7	# n = c - n  (unsigned)
	blti	r4,a_xfer_end		# if n < 0, less than one word to transfer

a_word_xfer:
	andi	r4,r7,0xfffffffc	# n = c & ~3 (bytes handled by the word loops)
	addi	r10,r0,0		# offset = 0

	andi	r9,r6,3		# t1 = s & 3
	bnei	r9,a_word_unaligned		# if temp != 0, unaligned transfers needed

a_word_aligned:
	lw	r9,r6,r10	# t1 = *(s+offset)
	sw	r9,r5,r10	# *(d+offset) = t1
	addi	r4,r4,-4	# n = n - 4
	bneid	r4,a_word_aligned		# loop
	addi	r10,r10,4	# offset = offset + 4 (IN DELAY SLOT)

	bri	a_word_done

a_word_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	lwi	r11,r8,0		# h = *(as + 0)
	addi	r8,r8,4		# as = as + 4

	addi	r9,r9,-1
	beqi	r9,a_word_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,a_word_u2		# t1 was 2 => 2 byte offset

a_word_u3:
	bslli	r11,r11,24	# h = h << 24
a_wu3_loop:
	lw	r12,r8,r10		# v = *(as + offset)
	bsrli	r9,r12,8	# t1 = v >> 8
	or	r9,r11,r9	# t1 = h | t1
	sw	r9,r5,r10	# *(d + offset) = t1
	bslli	r11,r12,24	# h = v << 24
	addi	r4,r4,-4	# n = n - 4
	bneid	r4,a_wu3_loop	# while (n) loop
	addi	r10,r10,4		# offset = offset + 4 (IN DELAY SLOT)

	bri	a_word_done

a_word_u1:
	bslli	r11,r11,8	# h = h << 8
a_wu1_loop:
	lw	r12,r8,r10		# v = *(as + offset)
	bsrli	r9,r12,24	# t1 = v >> 24
	or	r9,r11,r9	# t1 = h | t1
	sw	r9,r5,r10	# *(d + offset) = t1
	bslli	r11,r12,8	# h = v << 8
	addi	r4,r4,-4	# n = n - 4
	bneid	r4,a_wu1_loop	# while (n) loop
	addi	r10,r10,4		# offset = offset + 4 (IN DELAY SLOT)

	bri	a_word_done

a_word_u2:
	bslli	r11,r11,16	# h = h << 16
a_wu2_loop:
	lw	r12,r8,r10		# v = *(as + offset)
	bsrli	r9,r12,16	# t1 = v >> 16
	or	r9,r11,r9	# t1 = h | t1
	sw	r9,r5,r10	# *(d + offset) = t1
	bslli	r11,r12,16	# h = v << 16
	addi	r4,r4,-4	# n = n - 4
	bneid	r4,a_wu2_loop	# while (n) loop
	addi	r10,r10,4		# offset = offset + 4 (IN DELAY SLOT)
	# falls through to a_word_done

a_word_done:
	add	r5,r5,r10	# d = d + offset
	add	r6,r6,r10	# s = s + offset
	rsub	r7,r10,r7	# c = c - offset

a_xfer_end:
a_xfer_end_loop:
	beqi	r7,a_done	# while (c)
	lbui	r9,r6,0	# t1 = *s
	addi	r6,r6,1		# s++
	sbi	r9,r5,0	# *d = t1
	addi	r7,r7,-1	# c--
	brid	a_xfer_end_loop		# loop
	addi	r5,r5,1		# d++ (IN DELAY SLOT)

a_done:
	rtsd	r15,8		# return; r3 still holds the starting destination
	nop			# (delay slot)

.end C_SYMBOL_NAME(memcpy)

###############################################################
	.globl	C_SYMBOL_NAME(memmove)
	.ent	C_SYMBOL_NAME(memmove)

#######################################
# memmove - overlap-safe copy
#	Input :	r5 - destination address (d)
#		r6 - source address (s)
#		r7 - number of bytes to transfer (c)
#	Output:	r3 - starting destination address
#	Scratch: r4 (n), r8 (as), r9 - r12 (data temporaries)
#
# When d <= s (unsigned) control transfers to fast_memcpy_ascending,
# which copies from low to high addresses.  Otherwise the copy below
# runs from high to low addresses: d and s are first advanced by c so
# that they point one past the end of each region, and every phase
# pre-decrements before it transfers.
#
# Phases, in order:
#	d_xfer_first_loop  - byte copies until d is word aligned
#	d_block_*          - 32 bytes per pass, unrolled
#	d_word_*           - one word per pass
#	d_xfer_end_loop    - remaining bytes
#
# The *_u1 / *_u2 / *_u3 paths handle a source that is 1, 2 or 3 bytes
# off a word boundary once d is aligned.  They read aligned source words
# only and build each destination word as
#	*d = h | (v shifted up)
# where h holds the bytes carried down from the previously read (higher)
# source word.  The shift directions assume big-endian byte order, and
# bslli/bsrli need the barrel shifter.
#######################################

C_SYMBOL_NAME(memmove):

	cmpu	r4,r5,r6	# n = s - d, sign bit reflects the unsigned compare
	bgei	r4,fast_memcpy_ascending	# d <= s: ascending copy is overlap safe

fast_memcpy_descending:

	addi	r3,r5,0		# move d to return register as value of function

	add	r5,r5,r7		# d = d + c (one past the last destination byte)
	add	r6,r6,r7		# s = s + c (one past the last source byte)

	addi	r4,r0,4		# n = 4
	cmpu	r4,r4,r7	# n = c - n  (unsigned)
	blti	r4,d_xfer_end		# if n < 0, less than one word to transfer

	# transfer first 0~3 bytes to get aligned dest address
	andi	r4,r5,3		# n = d & 3
	beqi	r4,d_dalign_done		# if zero, destination already aligned
	rsub	r7,r4,r7		# c = c - n adjust c

d_xfer_first_loop:
	beqi	r4,d_dalign_done		# if no bytes left to transfer, transfer the bulk
	addi	r6,r6,-1		# s--
	addi	r5,r5,-1		# d--
	lbui	r11,r6,0	# h = *s
	sbi	r11,r5,0	# *d = h
	brid	d_xfer_first_loop		# loop
	addi	r4,r4,-1	# n-- (IN DELAY SLOT)

d_dalign_done:
	addi	r4,r0,32		# n = 32
	cmpu	r4,r4,r7	# n = c - n  (unsigned)
	blti	r4,d_block_done		# if n < 0, less than one block to transfer

d_block_xfer:
	andi	r4,r7,0xffffffe0	# n = c & ~31 (bytes handled by the block loops)
	rsub	r7,r4,r7		# c = c - n

	andi	r9,r6,3		# t1 = s & 3
	bnei	r9,d_block_unaligned		# if temp != 0, unaligned transfers needed

d_block_aligned:
	addi	r6,r6,-32	# s = s - 32
	addi	r5,r5,-32	# d = d - 32
	lwi	r9,r6,28		# t1 = *(s + 28)
	lwi	r10,r6,24	# t2 = *(s + 24)
	lwi	r11,r6,20	# t3 = *(s + 20)
	lwi	r12,r6,16	# t4 = *(s + 16)
	swi	r9,r5,28		# *(d + 28) = t1
	swi	r10,r5,24	# *(d + 24) = t2
	swi	r11,r5,20	# *(d + 20) = t3
	swi	r12,r5,16	# *(d + 16) = t4
	lwi	r9,r6,12		# t1 = *(s + 12)
	lwi	r10,r6,8		# t2 = *(s + 8)
	lwi	r11,r6,4		# t3 = *(s + 4)
	lwi	r12,r6,0		# t4 = *(s + 0)
	swi	r9,r5,12		# *(d + 12) = t1
	swi	r10,r5,8		# *(d + 8) = t2
	swi	r11,r5,4		# *(d + 4) = t3
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,d_block_aligned	# while (n) loop
	swi	r12,r5,0		# *(d + 0) = t4 (IN DELAY SLOT)
	bri	d_block_done

d_block_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	rsub	r6,r4,r6		# s = s - n (s is not used again until the block is done)
	lwi	r11,r8,0		# h = *(as + 0)

	addi	r9,r9,-1
	beqi	r9,d_block_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,d_block_u2		# t1 was 2 => 2 byte offset

d_block_u3:
	bsrli	r11,r11,8	# h = h >> 8
d_bu3_loop:
	addi	r8,r8,-32		# as = as - 32
	addi	r5,r5,-32		# d = d - 32
	lwi	r12,r8,28		# v = *(as + 28)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,28	# *(d + 28) = t1
	bsrli	r11,r12,8	# h = v >> 8
	lwi	r12,r8,24		# v = *(as + 24)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,24	# *(d + 24) = t1
	bsrli	r11,r12,8	# h = v >> 8
	lwi	r12,r8,20		# v = *(as + 20)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,20	# *(d + 20) = t1
	bsrli	r11,r12,8	# h = v >> 8
	lwi	r12,r8,16		# v = *(as + 16)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,16	# *(d + 16) = t1
	bsrli	r11,r12,8	# h = v >> 8
	lwi	r12,r8,12		# v = *(as + 12)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,12	# *(d + 12) = t1
	bsrli	r11,r12,8	# h = v >> 8
	lwi	r12,r8,8		# v = *(as + 8)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,8	# *(d + 8) = t1
	bsrli	r11,r12,8	# h = v >> 8
	lwi	r12,r8,4		# v = *(as + 4)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,4	# *(d + 4) = t1
	bsrli	r11,r12,8	# h = v >> 8
	lwi	r12,r8,0		# v = *(as + 0)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,0	# *(d + 0) = t1
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,d_bu3_loop	# while (n) loop
	bsrli	r11,r12,8	# h = v >> 8 (IN DELAY SLOT)
	bri	d_block_done

d_block_u1:
	bsrli	r11,r11,24	# h = h >> 24
d_bu1_loop:
	addi	r8,r8,-32		# as = as - 32
	addi	r5,r5,-32		# d = d - 32
	lwi	r12,r8,28		# v = *(as + 28)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,28	# *(d + 28) = t1
	bsrli	r11,r12,24	# h = v >> 24
	lwi	r12,r8,24		# v = *(as + 24)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,24	# *(d + 24) = t1
	bsrli	r11,r12,24	# h = v >> 24
	lwi	r12,r8,20		# v = *(as + 20)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,20	# *(d + 20) = t1
	bsrli	r11,r12,24	# h = v >> 24
	lwi	r12,r8,16		# v = *(as + 16)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,16	# *(d + 16) = t1
	bsrli	r11,r12,24	# h = v >> 24
	lwi	r12,r8,12		# v = *(as + 12)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,12	# *(d + 12) = t1
	bsrli	r11,r12,24	# h = v >> 24
	lwi	r12,r8,8		# v = *(as + 8)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,8	# *(d + 8) = t1
	bsrli	r11,r12,24	# h = v >> 24
	lwi	r12,r8,4		# v = *(as + 4)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,4	# *(d + 4) = t1
	bsrli	r11,r12,24	# h = v >> 24
	lwi	r12,r8,0		# v = *(as + 0)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,0	# *(d + 0) = t1
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,d_bu1_loop	# while (n) loop
	bsrli	r11,r12,24	# h = v >> 24 (IN DELAY SLOT)
	bri	d_block_done

d_block_u2:
	bsrli	r11,r11,16	# h = h >> 16
d_bu2_loop:
	addi	r8,r8,-32		# as = as - 32
	addi	r5,r5,-32		# d = d - 32
	lwi	r12,r8,28		# v = *(as + 28)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,28	# *(d + 28) = t1
	bsrli	r11,r12,16	# h = v >> 16
	lwi	r12,r8,24		# v = *(as + 24)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,24	# *(d + 24) = t1
	bsrli	r11,r12,16	# h = v >> 16
	lwi	r12,r8,20		# v = *(as + 20)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,20	# *(d + 20) = t1
	bsrli	r11,r12,16	# h = v >> 16
	lwi	r12,r8,16		# v = *(as + 16)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,16	# *(d + 16) = t1
	bsrli	r11,r12,16	# h = v >> 16
	lwi	r12,r8,12		# v = *(as + 12)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,12	# *(d + 12) = t1
	bsrli	r11,r12,16	# h = v >> 16
	lwi	r12,r8,8		# v = *(as + 8)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,8	# *(d + 8) = t1
	bsrli	r11,r12,16	# h = v >> 16
	lwi	r12,r8,4		# v = *(as + 4)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,4	# *(d + 4) = t1
	bsrli	r11,r12,16	# h = v >> 16
	lwi	r12,r8,0		# v = *(as + 0)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	swi	r9,r5,0	# *(d + 0) = t1
	addi	r4,r4,-32	# n = n - 32
	bneid	r4,d_bu2_loop	# while (n) loop
	bsrli	r11,r12,16	# h = v >> 16 (IN DELAY SLOT)
	# falls through to d_block_done

d_block_done:
	addi	r4,r0,4		# n = 4
	cmpu	r4,r4,r7	# n = c - n  (unsigned)
	blti	r4,d_xfer_end		# if n < 0, less than one word to transfer

d_word_xfer:
	andi	r4,r7,0xfffffffc	# n = c & ~3 (bytes handled by the word loops)
	rsub	r5,r4,r5		# d = d - n (now the lowest word to be written)
	rsub	r6,r4,r6		# s = s - n
	rsub	r7,r4,r7		# c = c - n

	andi	r9,r6,3		# t1 = s & 3
	bnei	r9,d_word_unaligned		# if temp != 0, unaligned transfers needed

d_word_aligned:
	addi	r4,r4,-4	# n = n - 4
	lw	r9,r6,r4		# t1 = *(s+n)
	bneid	r4,d_word_aligned		# loop
	sw	r9,r5,r4		# *(d+n) = t1 (IN DELAY SLOT)

	bri	d_word_done

d_word_unaligned:
	andi	r8,r6,0xfffffffc	# as = s & ~3
	lw	r11,r8,r4		# h = *(as + n)

	addi	r9,r9,-1
	beqi	r9,d_word_u1		# t1 was 1 => 1 byte offset
	addi	r9,r9,-1
	beqi	r9,d_word_u2		# t1 was 2 => 2 byte offset

d_word_u3:
	bsrli	r11,r11,8	# h = h >> 8
d_wu3_loop:
	addi	r4,r4,-4	# n = n - 4
	lw	r12,r8,r4		# v = *(as + n)
	bslli	r9,r12,24	# t1 = v << 24
	or	r9,r11,r9	# t1 = h | t1
	sw	r9,r5,r4	# *(d + n) = t1
	bneid	r4,d_wu3_loop	# while (n) loop
	bsrli	r11,r12,8	# h = v >> 8 (IN DELAY SLOT)

	bri	d_word_done

d_word_u1:
	bsrli	r11,r11,24	# h = h >> 24
d_wu1_loop:
	addi	r4,r4,-4	# n = n - 4
	lw	r12,r8,r4		# v = *(as + n)
	bslli	r9,r12,8	# t1 = v << 8
	or	r9,r11,r9	# t1 = h | t1
	sw	r9,r5,r4	# *(d + n) = t1
	bneid	r4,d_wu1_loop	# while (n) loop
	bsrli	r11,r12,24	# h = v >> 24 (IN DELAY SLOT)

	bri	d_word_done

d_word_u2:
	bsrli	r11,r11,16	# h = h >> 16
d_wu2_loop:
	addi	r4,r4,-4	# n = n - 4
	lw	r12,r8,r4		# v = *(as + n)
	bslli	r9,r12,16	# t1 = v << 16
	or	r9,r11,r9	# t1 = h | t1
	sw	r9,r5,r4	# *(d + n) = t1
	bneid	r4,d_wu2_loop	# while (n) loop
	bsrli	r11,r12,16	# h = v >> 16 (IN DELAY SLOT)
	# falls through to d_word_done

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	beqi	r7,d_done	# while (c); exit through this routine's own epilogue
	addi	r6,r6,-1		# s--
	lbui	r9,r6,0	# t1 = *s
	addi	r5,r5,-1		# d--
	sbi	r9,r5,0	# *d = t1
	brid	d_xfer_end_loop		# loop
	addi	r7,r7,-1	# c-- (IN DELAY SLOT)

d_done:
	rtsd	r15,8		# return; r3 still holds the starting destination
	nop			# (delay slot)

.end C_SYMBOL_NAME(memmove)