[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [microblaze-uclinux] xenet_FifoSend(struct sk_buff *orig_skb, struct net_device *dev)
... updated version with more optimized main loop - just getting used to
this MicroBlaze thingie <G>
Jim Law
____________________________________________________
Hi,
I've been following the discussion on speeding up the ethernet with
interest. I had a look at the do_csum() routine in the
arch/microblaze/lib/checksum.c file and produced an assembler version in the
hopes of providing some speedup.
I'm not familiar with how to best fold this into the normal code base - I
just commented out the do_csum in the c file and added assly_csum.o to the
makefile in that directory.
I've attached the assly_csum.S file to this message.
I'd be interested if the 100us checksum calculation in your test case below
is changed much by using this optimization.
Jim Law
Iris Power LP
###################################-*-asm*-
#
# Copyright 2008 (c) Jim Law - Iris LP All rights reserved.
#
# This file is subject to the terms and conditions of the GNU General
# Public License. See the file COPYING in the main directory of this
# archive for more details.
#
# Written by Jim Law <jlaw@xxxxxxxxxxxxx>
#
# intended to replace the do_csum in checksum.c in arch/microblaze/lib
#
#
# assly_csum.S
#
# Attempt at quicker checksum for ethernet
# Input : Operand1 in Reg r5 - starting address of buffer
# Operand2 in Reg r6 - number of bytes to perform checksum on
# Output: Result in Reg r3 - checksum 16
#
#
# Explanation:
# Perform modulo 16 bit checksum on a (possibly unaligned)
# big-endian buffer of size spec'd in bytes
#
#
#######################################
#include <asm/clinkage.h>
# do_csum: 16-bit checksum of a byte buffer, summed one 32-bit word at a time.
#
# The buffer is treated as big-endian.  The aligned words holding the first
# and the last byte are loaded whole and masked, so bytes that lie outside
# the buffer contribute nothing.  Words are accumulated with add-with-carry
# and the 32-bit total is then folded down to 16 bits.  When the buffer
# starts on an odd address the two bytes of the result are swapped.
#
# In:   r5 = address of the first byte (any alignment)
#       r6 = byte count; a zero or negative count returns 0
# Out:  r3 = 16-bit sum
# Uses: r4, r7, r8, r9, r10, r11 and the carry bit
#
# NOTE(review): the negative-count guard assumes the C caller passes a
# signed int length, as the do_csum() this replaces is believed to do.
        .globl  C_SYMBOL_NAME(do_csum)
        .ent    C_SYMBOL_NAME(do_csum)
C_SYMBOL_NAME(do_csum):
        bleid   r6,1f                   # count <= 0: nothing to sum (a negative count would
                                        # otherwise turn into a huge word count below)
        addik   r3,r0,0                 # delay slot: csum = 0

        # --- locate the first and last word, build the two byte masks ---
        andi    r7,r5,-4                # r7 = address of the aligned word holding the first byte
        add     r8,r5,r6
        addik   r8,r8,-1                # r8 = address of the last byte

        andi    r4,r5,3
        bslli   r4,r4,3                 # r4 = 8 * (start address & 3)
        addik   r9,r0,-1
        bsrl    r9,r9,r4                # r9 = startmask = 0xffffffff >> r4

        andi    r4,r8,3
        bslli   r4,r4,3
        rsubi   r4,r4,24                # r4 = 24 - 8 * (end address & 3)
        addik   r10,r0,-1
        bsll    r10,r10,r4              # r10 = endmask = 0xffffffff << r4

        bsrli   r4,r5,2
        bsrli   r11,r8,2
        rsub    r11,r4,r11              # r11 = index of last word - index of first word

        # --- first word, masked ---
        lwi     r3,r7,0
        and     r3,r3,r9                # csum = first word & startmask
        addik   r7,r7,4                 # step to the next word
        bneid   r11,2f                  # more than one word to sum
        addi    r0,r0,0                 # delay slot: clear carry for the addc chain on either path

        # the whole buffer sits inside a single word
        brid    3f
        and     r3,r3,r10               # delay slot: csum &= endmask

2:      # more than one word: r11 becomes the count of whole words between first and last
        addik   r11,r11,-1              # addik keeps the carry clear
5:      beqi    r11,4f                  # no whole words left
        lwi     r4,r7,0
        addc    r3,r3,r4                # csum += word + carry
        addik   r11,r11,-1              # addik: the carry from addc must survive to the next addc
        brid    5b
        addik   r7,r7,4                 # delay slot: step to the next word, carry untouched

4:      # last (possibly partial) word
        lwi     r4,r7,0
        and     r4,r4,r10               # drop the bytes past the end of the buffer
        addc    r3,r3,r4                # csum += masked word + carry

3:      # fold the 32-bit sum into 16 bits
        bsrli   r4,r3,16                # r4 = high half-word
        andi    r3,r3,0x0000ffff        # r3 = low half-word (value needs a 32-bit immediate)
        addc    r3,r3,r4                # low + high + carry left by the word sum; may reach bit 16
        bsrli   r4,r3,16
        andi    r3,r3,0x0000ffff
        add     r3,r3,r4                # add the overflow back in; result now fits in 16 bits

        andi    r4,r5,1
        beqi    r4,1f                   # even start address: no byte swap needed

        # odd start address: swap the two bytes of the 16-bit result
        bsrli   r4,r3,8                 # r4 = high byte (bits above 15 are already zero)
        andi    r3,r3,0x000000ff
        bslli   r3,r3,8                 # r3 = low byte moved to the high position
        or      r3,r3,r4

1:      rtsd    r15,8                   # return to caller
        nop                             # delay slot
        .end    C_SYMBOL_NAME(do_csum)