[PATCH] Improved strlen for ARM, around 29% faster

Gabriel Gonzalez gabriel.gonzalez.garcia at gmail.com
Sat Sep 29 00:48:48 UTC 2012


This version for ARM improves performance mainly by unrolling the loop four times
and by reducing the number of instructions needed to look for the null character.
A deeper analysis of this can be found at http://www.gabrielgonzalezgarcia.com/2012/10/02/mystrlen-vs-android-bionics-strlen-on-arm-cpu/ 
where you can find some data which back up the performance improvement.
I have only tested it on a little-endian CPU, so the big-endian code path still needs testing.

Signed-off-by: Gabriel Gonzalez <gabriel.gonzalez.garcia at gmail.com>
---
 libc/string/arm/strlen.S |  123 +++++++++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 51 deletions(-)

diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S
index 949e918..b87fbb4 100644
--- a/libc/string/arm/strlen.S
+++ b/libc/string/arm/strlen.S
@@ -46,63 +46,84 @@ strlen:
 	bx lr
 #else
 strlen:
-	bic     r1, r0, $3              @ addr of word containing first byte
-	ldr     r2, [r1], $4            @ get the first word
-	ands    r3, r0, $3              @ how many bytes are duff?
-	rsb     r0, r3, $0              @ get - that number into counter.
-	beq     Laligned                @ skip into main check routine if no
-					@ more
-#if __BYTE_ORDER == __BIG_ENDIAN
-	orr     r2, r2, $0xff000000     @ set this byte to non-zero
-	subs    r3, r3, $1              @ any more to do?
-	IT(t, gt)
-	orrgt   r2, r2, $0x00ff0000     @ if so, set this byte
-	subs    r3, r3, $1              @ more?
-	IT(t, gt)
-	orrgt   r2, r2, $0x0000ff00     @ then set.
-#else
-	orr     r2, r2, $0x000000ff     @ set this byte to non-zero
-	subs    r3, r3, $1              @ any more to do?
-	IT(t, gt)
-	orrgt   r2, r2, $0x0000ff00     @ if so, set this byte
-	subs    r3, r3, $1              @ more?
-	IT(t, gt)
-	orrgt   r2, r2, $0x00ff0000     @ then set.
-#endif
-Laligned:				@ here, we have a word in r2.  Does it
-	tst     r2, $0x000000ff         @ contain any zeroes?
-	IT(tttt, ne)
-	tstne   r2, $0x0000ff00         @
-	tstne   r2, $0x00ff0000         @
-	tstne   r2, $0xff000000         @
-	addne   r0, r0, $4              @ if not, the string is 4 bytes longer
-	IT(t, ne)
-	ldrne   r2, [r1], $4            @ and we continue to the next word
-	bne     Laligned                @
-Llastword:				@ drop through to here once we find a
+	@ size_t strlen(const char *s): a1 = s on entry, length on return.
+	@ v7 = scan pointer, v6 = 0x80808080, v1-v4 = data words, v5 = scratch.
+	stmfd	sp!, {v1, v2, v3, v4, v5, v6, v7, lr}
+	mov	v7, a1
+	ldr	v6,  =0x80808080
+
+	@ Scan bytewise up to a 16-byte boundary, so that the 16-byte block
+	@ loads below never cross a page boundary beyond the terminator.
+.Lstrlen_align:
+	tst	v7, #15
+	beq	.Lstrlen_block
+	ldrb	v1, [v7], #1
+	cmp	v1, #0
+	bne	.Lstrlen_align
+	sub	v7, v7, #1		@ step back onto the terminator
+	b	.Lstrlen_done
+
+	@ w has a zero byte iff ((w - 0x01010101) & 0x80808080 & ~w) != 0.
+.Lstrlen_block:
+	ldmfd	v7!, {v1, v2, v3, v4}
+	sub	v5, v1, v6, LSR #7	@ v6 LSR #7 = 0x01010101
+	and	v5, v5, v6
+	bics	v5, v5, v1
+	bne	.Lstrlen_word1
+	sub	v5, v2, v6, LSR #7
+	and	v5, v5, v6
+	bics	v5, v5, v2
+	bne	.Lstrlen_word2
+	sub	v5, v3, v6, LSR #7
+	and	v5, v5, v6
+	bics	v5, v5, v3
+	bne	.Lstrlen_word3
+	sub	v5, v4, v6, LSR #7
+	and	v5, v5, v6
+	bics	v5, v5, v4
+	beq	.Lstrlen_block
+
+	mov	v1, v4			@ zero byte is in the fourth word
+	sub	v7, v7, #4
+	b	.Lstrlen_tail
+.Lstrlen_word3:
+	mov	v1, v3
+	sub	v7, v7, #8
+	b	.Lstrlen_tail
+.Lstrlen_word2:
+	mov	v1, v2
+	sub	v7, v7, #12
+	b	.Lstrlen_tail
+.Lstrlen_word1:
+	sub	v7, v7, #16
+.Lstrlen_tail:				@ v1 = word with the zero, v7 = its address
 #if __BYTE_ORDER == __BIG_ENDIAN
-	tst     r2, $0xff000000         @ word that has a zero byte in it
-	IT(tttt, ne)
-	addne   r0, r0, $1              @
-	tstne   r2, $0x00ff0000         @ and add up to 3 bytes on to it
-	addne   r0, r0, $1              @
-	tstne   r2, $0x0000ff00         @ (if first three all non-zero, 4th
-	IT(t, ne)
-	addne   r0, r0, $1              @  must be zero)
+	tst	v1, #0xFF000000		@ lowest address is the top byte
+	IT(tttt, ne)
+	addne	v7, v7, #1
+	tstne	v1, #0x00FF0000
+	addne	v7, v7, #1
+	tstne	v1, #0x0000FF00
+	IT(t, ne)
+	addne	v7, v7, #1		@ three non-zero bytes: the fourth is zero
 #else
-	tst     r2, $0x000000ff         @
-	IT(tttt, ne)
-	addne   r0, r0, $1              @
-	tstne   r2, $0x0000ff00         @ and add up to 3 bytes on to it
-	addne   r0, r0, $1              @
-	tstne   r2, $0x00ff0000         @ (if first three all non-zero, 4th
-	IT(t, ne)
-	addne   r0, r0, $1              @  must be zero)
+	tst	v1, #0x000000FF		@ lowest address is the bottom byte
+	IT(tttt, ne)
+	addne	v7, v7, #1
+	tstne	v1, #0x0000FF00
+	addne	v7, v7, #1
+	tstne	v1, #0x00FF0000
+	IT(t, ne)
+	addne	v7, v7, #1		@ three non-zero bytes: the fourth is zero
 #endif
+.Lstrlen_done:
+	sub	a1, v7, a1		@ length = terminator address - start
+
 #if defined(__USE_BX__)
+	ldmfd	sp!, {v1, v2, v3, v4, v5, v6, v7, lr}
         bx      lr
 #else
-  	mov	pc,lr
+	ldmfd	sp!, {v1, v2, v3, v4, v5, v6, v7, pc}
 #endif
 #endif
 
-- 
1.7.9.5



More information about the uClibc mailing list