[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [microblaze-uclinux] xenet_FifoSend(struct sk_buff *orig_skb, struct net_device *dev)



... updated version with more optimized main loop - just getting used to this MicroBlaze thingie <G>

Jim Law


____________________________________________________

Hi,

I've been following the discussion on speeding up the ethernet with
interest.  I had a look at the do_csum() routine in the
arch/microblaze/lib/checksum.c file and produced an assembler version in the
hopes of providing some speedup.

I'm not familiar with how to best fold this into the normal code base - I
just commented out the do_csum in the c file and added assly_csum.o to the
makefile in that directory.

I've attached the assly_csum.S file to this message.

I'd be interested if the 100us checksum calculation in your test case below
is changed much by using this optimization.

Jim Law
Iris Power LP

###################################-*-asm*-
#
# Copyright 2008 (c) Jim Law - Iris LP All rights reserved.
#
# This file is subject to the terms and conditions of the GNU General
# Public License.  See the file COPYING in the main directory of this
# archive for more details.
#
# Written by Jim Law <jlaw@xxxxxxxxxxxxx>
#
# intended to replace the do_csum in checksum.c in arch/microblaze/lib
#
#
# assly_csum.s
#
# Attempt at quicker checksum for ethernet
#	Input :	Operand1 in Reg r5 - starting address of buffer
#		Operand2 in Reg r6 - number of bytes to perform checksum on		
#	Output: Result in Reg r3 - checksum 16
#			
#
# Explanation:
# 	Perform modulo 16 bit checksum on a (possibly unaligned)
#	big-endian buffer of size spec'd in bytes
#
#
#######################################

#include <asm/clinkage.h>

	.globl	C_SYMBOL_NAME(do_csum)
	.ent	C_SYMBOL_NAME(do_csum)

C_SYMBOL_NAME(do_csum):

	beqid	r6,1f		# if num of bytes is zero, return with zero csum
	addik	r3,r0,0		# clear return csum - IN DELAY SLOT

	andi	r7,r5,0xfffffffc 	# calc buffer word address, implied sign extend
	
	add	r8,r5,r6	# end address = buff address + num of bytes
	addi	r8,r8,-1	# end address = end address - 1

	andi	r4,r5,3		# temp = buff address & 3
	bslli	r4,r4,3		# temp = temp * 8
	xori	r9,r0,0xffffffff	# r9 = 0xffffffff, implied sign extend
	bsrl	r9,r9,r4	# startmask = r9 >> temp

	andi	r4,r8,3		# temp = end address & 3
	bslli	r4,r4,3		# temp = temp * 8
	rsubi	r4,r4,24	# temp = 24 - temp
	xori	r10,r0,0xffffffff	# r10 = 0xffffffff, implied sign extend
	bsll	r10,r10,r4	# startmask = r9 << temp

	bsrli	r4,r5,2		# temp = buff address >> 2
	bsrli	r11,r8,2	# word count = end address >> 2
	rsub	r11,r4,r11	# word count = word count - temp

	# add in the first word, appropriately masked
	lwi	r3,r7,0		# csum = *word address
	and	r3,r3,r9	# csum = csum & startmask
	bneid	r11,2f		# if word count != 0, go do more than one word sum
	addi	r7,r7,4		# word address++ - IN DELAY SLOT	

	# when get here, then all bytes to be summed are in one word
	brid 	3f		# goto sum the half-words
	and	r3,r3,r10	# csum = csum & endmask - IN DELAY SLOT

2:	# when get here, then more than one word to be summed
	addi	r11,r11,-1	# word count = word count - 1
	addi	r0,r0,0		# clear carry for add with carry in loop

5:	beqi	r11,4f		# if no more words to do, leave loop
	lwi	r4,r7,0		# temp = *word address	
	addc	r3,r3,r4	# csum = csum + temp + carry
	addik	r11,r11,-1	# word count = word count - 1, don't disturb carry
	brid	5b
	addik	r7,r7,4		# word address++, don't disturb carry - IN DELAY SLOT	

4:	# deal with last (possibly partial) word
	lwi	r4,r7,0		# temp = *word address	
	and	r4,r4,r10	# temp = temp & endmask
	addc	r3,r3,r4	# csum = csum + temp, include carry 		

3:	# sum the halfwords in the result
	bsrli	r4,r3,16	# temp = csum >> 16
	andi	r3,r3,0x0000ffff 	# csum = csum & 0x0000ffff, need .imm here to override sign ext.
	addc	r3,r3,r4	# csum = csum + temp + carry, this might have carried out of ls half-word
	# .. so add the high half-word back in again
	bsrli	r4,r3,16	# temp = csum >> 16
	andi	r3,r3,0x0000ffff 	# csum = csum & 0x0000ffff, need .imm here to override sign ext.
	add	r3,r3,r4	# csum = csum + temp, this will never carry out of ls half-word

	andi	r4,r5,1		# temp = buff address & 1, check if high/low bytes need to be swapped
	beqi	r4,1f		# started on half-word boundary, ok to not swap

	# swap the high / low bytes in the 16 bit csum
	bsrli	r4,r3,8		# temp = csum >> 8, no high bits on, so no need to mask
	andi	r3,r3,0x000000ff	# csum = csum & 0xff, implied sign extend ok
	bslli	r3,r3,8		# csum = csum << 8
	or	r3,r3,r4	# csum = csum | temp

	# Restore Frame and return	
1:	rtsd	r15,8
	nop

.end C_SYMBOL_NAME(do_csum)