[PATCH] Improved strlen for ARM, around 29% faster

Khem Raj raj.khem at gmail.com
Wed Oct 3 18:59:54 UTC 2012


On Fri, Sep 28, 2012 at 5:48 PM, Gabriel Gonzalez
<gabriel.gonzalez.garcia at gmail.com> wrote:
> This version for ARM improves performance mainly unrolling the loop for iterations
> and reducing the instructions need to look for the null character.
> A deeper analysis of this can be found at http://www.gabrielgonzalezgarcia.com/2012/10/02/mystrlen-vs-android-bionics-strlen-on-arm-cpu/
> where you can find some data which back up the performance improvement.
> I have only tested it on a little endian CPU so the BIG ENDIAN chunk might need some testing
>
> Signed-off-by: Gabriel Gonzalez <gabriel.gonzalez.garcia at gmail.com>
> ---
>  libc/string/arm/strlen.S |  123 +++++++++++++++++++++++++++-------------------
>  1 file changed, 72 insertions(+), 51 deletions(-)
>
> diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S
> index 949e918..b87fbb4 100644
> --- a/libc/string/arm/strlen.S
> +++ b/libc/string/arm/strlen.S
> @@ -46,63 +46,84 @@ strlen:
>         bx lr
>  #else
>  strlen:
> -       bic     r1, r0, $3              @ addr of word containing first byte
> -       ldr     r2, [r1], $4            @ get the first word
> -       ands    r3, r0, $3              @ how many bytes are duff?
> -       rsb     r0, r3, $0              @ get - that number into counter.
> -       beq     Laligned                @ skip into main check routine if no
> -                                       @ more
> -#if __BYTE_ORDER == __BIG_ENDIAN
> -       orr     r2, r2, $0xff000000     @ set this byte to non-zero
> -       subs    r3, r3, $1              @ any more to do?
> -       IT(t, gt)
> -       orrgt   r2, r2, $0x00ff0000     @ if so, set this byte
> -       subs    r3, r3, $1              @ more?
> -       IT(t, gt)
> -       orrgt   r2, r2, $0x0000ff00     @ then set.
> -#else
> -       orr     r2, r2, $0x000000ff     @ set this byte to non-zero
> -       subs    r3, r3, $1              @ any more to do?
> -       IT(t, gt)
> -       orrgt   r2, r2, $0x0000ff00     @ if so, set this byte
> -       subs    r3, r3, $1              @ more?
> -       IT(t, gt)
> -       orrgt   r2, r2, $0x00ff0000     @ then set.
> -#endif
> -Laligned:                              @ here, we have a word in r2.  Does it
> -       tst     r2, $0x000000ff         @ contain any zeroes?
> -       IT(tttt, ne)
> -       tstne   r2, $0x0000ff00         @
> -       tstne   r2, $0x00ff0000         @
> -       tstne   r2, $0xff000000         @
> -       addne   r0, r0, $4              @ if not, the string is 4 bytes longer
> -       IT(t, ne)
> -       ldrne   r2, [r1], $4            @ and we continue to the next word
> -       bne     Laligned                @
> -Llastword:                             @ drop through to here once we find a
> +       stmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, lr}
> +
> +       mov     v7, a1
> +       ldr     v6,  =0x80808080
> +
> +       ##
> +       ## un-aligned address till we get aligned
> +       ##
> +1:     tst     v7, #3
> +       beq     0f
> +       ldrb    v1, [v7], #1
> +       tst     v1, #0xFF
> +       beq     4f
> +       bal     1b
> +
> +
> +       ## un-rolling strings
> +       ## as few instructions in the loop
> +       ## as possible
> +       ## - Check whether any position equals 0
> +0:     ldmfd   v7!, {v1, v2, v3, v4}
> +       sub     v5, v1, v6, LSR #7
> +       and     v5, v5, v6
> +       bics    v5, v5, v1
> +       bne     1f
> +       sub     v5, v2, v6, LSR #7
> +       and     v5, v5, v6
> +       bics    v5, v5, v2
> +       bne     2f
> +       sub     v5, v3, v6, LSR #7
> +       and     v5, v5, v6
> +       bics    v5, v5, v3
> +       bne     3f
> +       sub     v5, v4, v6, LSR #7
> +       and     v5, v5, v6
> +       bics    v5, v5, v4
> +       bne     4f
> +       beq     0b
> +
> +4:     mov     v1, v4
> +       bal     0f
> +3:     mov     v1, v3
> +       sub     v7, v7, #4
> +       bal     0f
> +2:     mov     v1, v2
> +       sub     v7, v7, #8
> +       bal     0f
> +1:     sub     v7, v7, #12
> +       ## After the loop we calculate the diff
> +       ## between the end and the begining of the str
> +       ##
>  #if __BYTE_ORDER == __BIG_ENDIAN
> -       tst     r2, $0xff000000         @ word that has a zero byte in it
> -       IT(tttt, ne)
> -       addne   r0, r0, $1              @
> -       tstne   r2, $0x00ff0000         @ and add up to 3 bytes on to it
> -       addne   r0, r0, $1              @
> -       tstne   r2, $0x0000ff00         @ (if first three all non-zero, 4th
> -       IT(t, ne)
> -       addne   r0, r0, $1              @  must be zero)
> +0:     tst     v1, #0xFF000000
> +       subeq   v7, v7, #1
> +       tst     v1, #0x00FF0000
> +       subeq   v7, v7, #1
> +       tst     v1, #0x0000FF00
> +       subeq   v7, v7, #1
> +       tst     v1, #0x000000FF
> +4:     subeq   v7, v7, #1
>  #else
> -       tst     r2, $0x000000ff         @
> -       IT(tttt, ne)
> -       addne   r0, r0, $1              @
> -       tstne   r2, $0x0000ff00         @ and add up to 3 bytes on to it
> -       addne   r0, r0, $1              @
> -       tstne   r2, $0x00ff0000         @ (if first three all non-zero, 4th
> -       IT(t, ne)
> -       addne   r0, r0, $1              @  must be zero)
> +0:     tst     v1, #0xFF
> +       subeq   v7, v7, #1
> +       tst     v1, #0xFF00
> +       subeq   v7, v7, #1
> +       tst     v1, #0xFF0000
> +       subeq   v7, v7, #1
> +       tst     v1, #0xFF000000
> +4:     subeq   v7, v7, #1
>  #endif
> +
> +       sub     a1, v7, a1
> +
>  #if defined(__USE_BX__)
> +       ldmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, lr}
>          bx      lr
>  #else
> -       mov     pc,lr
> +       ldmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, pc}
>  #endif
>  #endif
>
> --
> 1.7.9.5

This looks good. I would like that people try it on various arm's
and give it a good test. Current algorithm may be suboptimal but
it has been time tested.


More information about the uClibc mailing list