[uClibc] improved memcpy/memset on arm in uClibc-0.9.19
Markus Pietrek
maillist at fsforth.de
Tue Sep 16 15:40:34 UTC 2003
Hi folks,
the C implementation of memcpy() and memset() is very slow on ARM. Using
optimized assembler code the performance for 32bit aligned memory blocks can
be improved very much. Some test runs on the NS7520 ARM7 show that for
copying a memory region of 512kB only 20ms are needed instead of 320ms. About
the same ratio can be achieved for memset().
My code only works under the assumptions that the memory is 32-bit aligned,
that no wide characters (Wchar) are used, that the block size is large enough
(>> 0x38 bytes) to compensate for the overhead, and of course that the target
is an ARM.
So my questions are
o with these restrictions, does it make sense to put the code in this general
library?
o what is the best method for replacing C functions with custom assembler
functions? Obviously, hacking libc/string/wstring.c is not a good solution.
Here is a preliminary version of the optimized functions.
/*
 * Copy n elements from s2 to s1 and return s1.  The regions must not
 * overlap (both pointers are __restrict).
 *
 * On ARM (ARM state only) a hand-written fast path is used when all of the
 * following hold; otherwise the plain element-by-element loop runs:
 *   - the element type is one byte wide (the assembler counts n in bytes),
 *   - both pointers are 32-bit aligned,
 *   - n is larger than 0x38, so the setup cost is amortised.
 *
 * The fast path moves 0x38 bytes per iteration as two 7-register
 * (r3-r9, 28-byte) ldmia/stmia bursts, then finishes the remainder word by
 * word and finally byte by byte.
 */
Wvoid *Wmemcpy(Wvoid * __restrict s1, const Wvoid * __restrict s2, size_t n)
{
	register Wchar *r1 = s1;
	register const Wchar *r2 = s2;

#if defined(__arm__) && !defined(__thumb__)
	if (sizeof(Wchar) == 1 && n > 0x38
	    && (((unsigned long) r1 | (unsigned long) r2) & 0x3) == 0) {
		/*
		 * The pointers and the counter are updated by the assembler, so
		 * they are read-write ("+r") operands rather than plain inputs.
		 * "memory" tells the compiler that memory is read and written
		 * behind its back; "cc" that the condition flags are changed.
		 * NOTE(review): r9 is clobbered; confirm it is not reserved
		 * (e.g. as a PIC/static base) in the target ABI.
		 */
		__asm__ __volatile__ (
			/* Bulk loop: entered with n > 0x38, so it runs at least once. */
			"1:\n\t"
			"ldmia	%1!, {r3-r9}\n\t"
			"stmia	%0!, {r3-r9}\n\t"
			"ldmia	%1!, {r3-r9}\n\t"
			"stmia	%0!, {r3-r9}\n\t"
			"sub	%2, %2, #0x38\n\t"
			"cmp	%2, #0x38\n\t"
			"bhs	1b\n"		/* unsigned compare: n is a size_t */
			/* Remaining whole words. */
			"2:\n\t"
			"cmp	%2, #4\n\t"
			"blo	3f\n\t"
			"ldr	r4, [%1], #4\n\t"
			"str	r4, [%0], #4\n\t"
			"sub	%2, %2, #4\n\t"
			"b	2b\n"
			/* Remaining bytes. */
			"3:\n\t"
			"cmp	%2, #0\n\t"
			"beq	4f\n\t"
			"ldrb	r4, [%1], #1\n\t"
			"strb	r4, [%0], #1\n\t"
			"sub	%2, %2, #1\n\t"
			"b	3b\n"
			"4:\n"
			: "+r" (r1), "+r" (r2), "+r" (n)
			:
			: "r3", "r4", "r5", "r6", "r7", "r8", "r9", "memory", "cc");
		return s1;
	}
#endif

	/* Generic fallback: short, unaligned or non-byte-sized copies. */
	while (n) {
		*r1++ = *r2++;
		--n;
	}
	return s1;
}
/*
 * Store the value c into each of the first n elements of s and return s.
 *
 * On ARM (ARM state only) a hand-written fast path is used when all of the
 * following hold; otherwise the plain element-by-element loop runs:
 *   - the element type is one byte wide (the assembler counts n in bytes),
 *   - s is 32-bit aligned,
 *   - n is larger than 0x38, so the setup cost is amortised.
 *
 * The fast path replicates the low byte of c into all four bytes of r3,
 * copies that word into r4-r9, and stores 0x38 bytes per iteration as two
 * 7-register stmia bursts, then finishes the remainder word by word and
 * finally byte by byte.
 */
Wvoid *Wmemset(Wvoid *s, Wint c, size_t n)
{
	register Wuchar *p = (Wuchar *) s;
#ifdef __BCC__
	/* bcc can optimize the counter if it thinks it is a pointer... */
	register const char *np = (const char *) n;
#else
#define np n
#endif

#if defined(__arm__) && !defined(__thumb__)
	if (sizeof(Wuchar) == 1 && n > 0x38 && ((unsigned long) p & 0x3) == 0) {
		/*
		 * The pointer and the counter are updated by the assembler, so
		 * they are read-write ("+r") operands; c is only read, and only
		 * by the first instruction.  "memory" tells the compiler that
		 * memory is written behind its back; "cc" that the condition
		 * flags are changed.
		 * NOTE(review): r9 is clobbered; confirm it is not reserved
		 * (e.g. as a PIC/static base) in the target ABI.
		 */
		__asm__ __volatile__ (
			/* Build the fill pattern once, before the loop. */
			"and	r3, %2, #0xff\n\t"
			"orr	r3, r3, r3, lsl #8\n\t"
			"orr	r3, r3, r3, lsl #16\n\t"
			"mov	r4, r3\n\t"
			"mov	r5, r3\n\t"
			"mov	r6, r3\n\t"
			"mov	r7, r3\n\t"
			"mov	r8, r3\n\t"
			"mov	r9, r3\n"
			/* Bulk loop: entered with n > 0x38, so it runs at least once. */
			"1:\n\t"
			"stmia	%0!, {r3-r9}\n\t"
			"stmia	%0!, {r3-r9}\n\t"
			"sub	%1, %1, #0x38\n\t"
			"cmp	%1, #0x38\n\t"
			"bhs	1b\n"		/* unsigned compare: n is a size_t */
			/* Remaining whole words. */
			"2:\n\t"
			"cmp	%1, #4\n\t"
			"blo	3f\n\t"
			"str	r3, [%0], #4\n\t"
			"sub	%1, %1, #4\n\t"
			"b	2b\n"
			/* Remaining bytes. */
			"3:\n\t"
			"cmp	%1, #0\n\t"
			"beq	4f\n\t"
			"strb	r3, [%0], #1\n\t"
			"sub	%1, %1, #1\n\t"
			"b	3b\n"
			"4:\n"
			: "+r" (p), "+r" (n)
			: "r" (c)
			: "r3", "r4", "r5", "r6", "r7", "r8", "r9", "memory", "cc");
		return s;
	}
#endif

	/* Generic fallback: short, unaligned or non-byte-sized fills. */
	while (np) {
		*p++ = (Wuchar) c;
		--np;
	}
	return s;
}
/* Do not let the helper macro leak into the rest of the translation unit. */
#undef np
Bye,
--
Markus Pietrek
FS Forth-Systeme GmbH
Phone: +49 (7667) 908 145, FAX +49 (7667) 908 221
More information about the uClibc
mailing list