[PATCH] Replace MIPS specific memcpy.S/memset.S with version from glibc/newlib.

Khem Raj raj.khem at gmail.com
Tue Dec 22 00:43:11 UTC 2015


Hi Steve

On Mon, Dec 21, 2015 at 1:22 PM, Steve Ellcey <sellcey at imgtec.com> wrote:
> These MIPS specific versions of memcpy.S and memset.S are faster than
> the current ones and match what is in newlib and glibc.  They also have
> support for the mips32r6 and mips64r6 architectures.
>

What is the size impact? It would be nice to have that report as well.

> Signed-off-by: Steve Ellcey <sellcey at imgtec.com>
> ---
>  libc/string/mips/memcpy.S | 1051 +++++++++++++++++++++++++++++++++++----------
>  libc/string/mips/memset.S |  516 ++++++++++++++++------
>  2 files changed, 1229 insertions(+), 338 deletions(-)
>
> diff --git a/libc/string/mips/memcpy.S b/libc/string/mips/memcpy.S
> index 48c4f2a..2a187ef 100644
> --- a/libc/string/mips/memcpy.S
> +++ b/libc/string/mips/memcpy.S
> @@ -1,6 +1,5 @@
> -/* Copyright (C) 2002, 2003 Free Software Foundation, Inc.
> +/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
> -   Contributed by Hartvig Ekner <hartvige at mips.com>, 2002.
>
>     The GNU C Library is free software; you can redistribute it and/or
>     modify it under the terms of the GNU Lesser General Public
> @@ -13,243 +12,861 @@
>     Lesser General Public License for more details.
>
>     You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> +   License along with the GNU C Library.  If not, see
>     <http://www.gnu.org/licenses/>.  */
>
> -#include <features.h>
> -#include <sysdep.h>
> -#include <endian.h>
> +#ifdef ANDROID_CHANGES
> +# include "machine/asm.h"
> +# include "machine/regdef.h"
> +# define USE_MEMMOVE_FOR_OVERLAP
> +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#elif _LIBC
> +# include <sysdep.h>
> +# include <regdef.h>
> +# include <sys/asm.h>
> +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#elif defined _COMPILING_NEWLIB
> +# include "machine/asm.h"
> +# include "machine/regdef.h"
> +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#else
> +# include <regdef.h>
> +# include <sys/asm.h>
> +#endif
> +
> +#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
> +    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
> +# ifndef DISABLE_PREFETCH
> +#  define USE_PREFETCH
> +# endif
> +#endif
> +
> +#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
> +# ifndef DISABLE_DOUBLE
> +#  define USE_DOUBLE
> +# endif
> +#endif
> +
> +/* Some asm.h files do not have the L macro definition.  */
> +#ifndef L
> +# if _MIPS_SIM == _ABIO32
> +#  define L(label) $L ## label
> +# else
> +#  define L(label) .L ## label
> +# endif
> +#endif
> +
> +/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
> +#ifndef PTR_ADDIU
> +# ifdef USE_DOUBLE
> +#  define PTR_ADDIU    daddiu
> +# else
> +#  define PTR_ADDIU    addiu
> +# endif
> +#endif
> +
> +/* Some asm.h files do not have the PTR_SRA macro definition.  */
> +#ifndef PTR_SRA
> +# ifdef USE_DOUBLE
> +#  define PTR_SRA              dsra
> +# else
> +#  define PTR_SRA              sra
> +# endif
> +#endif
> +
> +/* New R6 instructions that may not be in asm.h.  */
> +#ifndef PTR_LSA
> +# if _MIPS_SIM == _ABI64
> +#  define PTR_LSA      dlsa
> +# else
> +#  define PTR_LSA      lsa
> +# endif
> +#endif
> +
> +/*
> + * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
> + * prefetches appears to offer a slight performance advantage.
> + *
> + * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
> + * or PREFETCH_STORE_STREAMED offers a large performance advantage
> + * but PREPAREFORSTORE has some special restrictions to consider.
> + *
> + * Prefetch with the 'prepare for store' hint does not copy a memory
> + * location into the cache, it just allocates a cache line and zeros
> + * it out.  This means that if you do not write to the entire cache
> + * line before writing it out to memory some data will get zero'ed out
> + * when the cache line is written back to memory and data will be lost.
> + *
> + * Also if you are using this memcpy to copy overlapping buffers it may
> + * not behave correctly when using the 'prepare for store' hint.  If you
> + * use the 'prepare for store' prefetch on a memory area that is in the
> + * memcpy source (as well as the memcpy destination), then you will get
> + * some data zero'ed out before you have a chance to read it and data will
> + * be lost.
> + *
> + * If you are going to use this memcpy routine with the 'prepare for store'
> + * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
> + * the problem of running memcpy on overlapping buffers.
> + *
> + * There are ifdef'ed sections of this memcpy to make sure that it does not
> + * do prefetches on cache lines that are not going to be completely written.
> + * This code is only needed and only used when PREFETCH_STORE_HINT is set to
> + * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
> + * 32 bytes and if the cache line is larger it will not work correctly.
> + */
> +
> +#ifdef USE_PREFETCH
> +# define PREFETCH_HINT_LOAD            0
> +# define PREFETCH_HINT_STORE           1
> +# define PREFETCH_HINT_LOAD_STREAMED   4
> +# define PREFETCH_HINT_STORE_STREAMED  5
> +# define PREFETCH_HINT_LOAD_RETAINED   6
> +# define PREFETCH_HINT_STORE_RETAINED  7
> +# define PREFETCH_HINT_WRITEBACK_INVAL 25
> +# define PREFETCH_HINT_PREPAREFORSTORE 30
> +
> +/*
> + * If we have not picked out what hints to use at this point use the
> + * standard load and store prefetch hints.
> + */
> +# ifndef PREFETCH_STORE_HINT
> +#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
> +# endif
> +# ifndef PREFETCH_LOAD_HINT
> +#  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
> +# endif
> +
> +/*
> + * We double everything when USE_DOUBLE is true so we do 2 prefetches to
> + * get 64 bytes in that case.  The assumption is that each individual
> + * prefetch brings in 32 bytes.
> + */
> +
> +# ifdef USE_DOUBLE
> +#  define PREFETCH_CHUNK 64
> +#  define PREFETCH_FOR_LOAD(chunk, reg) \
> + pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
> + pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
> +#  define PREFETCH_FOR_STORE(chunk, reg) \
> + pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
> + pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
> +# else
> +#  define PREFETCH_CHUNK 32
> +#  define PREFETCH_FOR_LOAD(chunk, reg) \
> + pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
> +#  define PREFETCH_FOR_STORE(chunk, reg) \
> + pref PREFETCH_STORE_HINT, (chunk)*32(reg)
> +# endif
> +/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
> + * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
> + * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
> + * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
> + * used then MAX_PREFETCH_SIZE does not matter.  */
> +# define MAX_PREFETCH_SIZE 128
> +/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
> + * than 5 on a STORE prefetch and that a single prefetch can never be larger
> + * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
> + * we actually do two prefetches in that case, one 32 bytes after the other.  */
> +# ifdef USE_DOUBLE
> +#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
> +# else
> +#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
> +# endif
> +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
> +    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
> +/* We cannot handle this because the initial prefetches may fetch bytes that
> + * are before the buffer being copied.  We start copies with an offset
> + * of 4 so we avoid this situation when using PREPAREFORSTORE.  */
> +#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
> +# endif
> +#else /* USE_PREFETCH not defined */
> +# define PREFETCH_FOR_LOAD(offset, reg)
> +# define PREFETCH_FOR_STORE(offset, reg)
> +#endif
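
For readers checking the arithmetic: with the constants above, the "last safe
prefetch" margin works out to a fixed byte count.  A small stand-alone C sketch
(constants mirrored from the macros in this hunk; the asserts are mine):

    #include <assert.h>

    /* Constants mirrored from the patch for the USE_DOUBLE (64-bit) case.  */
    #define PREFETCH_CHUNK     64
    #define MAX_PREFETCH_SIZE  128
    #define PREFETCH_LIMIT     ((5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE)

    int main(void)
    {
        /* The last "safe" prefetch address is (end of dst) - PREFETCH_LIMIT.  */
        assert(PREFETCH_LIMIT == 480);   /* 64-bit: 5*64 + 32 + 128            */
        assert(5 * 32 + 128 == 288);     /* 32-bit case has no extra 32        */
        return 0;
    }
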
> +
> +#if __mips_isa_rev > 5
> +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +#  undef PREFETCH_STORE_HINT
> +#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
> +# endif
> +# define R6_CODE
> +#endif
>
> -/* void *memcpy(void *s1, const void *s2, size_t n);  */
> +/* Allow the routine to be named something else if desired.  */
> +#ifndef MEMCPY_NAME
> +# define MEMCPY_NAME memcpy
> +#endif
> +
> +/* We use these 32/64 bit registers as temporaries to do the copying.  */
> +#define REG0 t0
> +#define REG1 t1
> +#define REG2 t2
> +#define REG3 t3
> +#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64))
> +# define REG4 t4
> +# define REG5 t5
> +# define REG6 t6
> +# define REG7 t7
> +#else
> +# define REG4 ta0
> +# define REG5 ta1
> +# define REG6 ta2
> +# define REG7 ta3
> +#endif
>
> -#ifdef __mips64
> +/* We load/store 64 bits at a time when USE_DOUBLE is true.
> + * The C_ prefix stands for CHUNK and is used to avoid macro name
> + * conflicts with system header files.  */
>
> -#include <sys/asm.h>
> +#ifdef USE_DOUBLE
> +# define C_ST  sd
> +# define C_LD  ld
> +# ifdef __MIPSEB
> +#  define C_LDHI       ldl     /* high part is left in big-endian      */
> +#  define C_STHI       sdl     /* high part is left in big-endian      */
> +#  define C_LDLO       ldr     /* low part is right in big-endian      */
> +#  define C_STLO       sdr     /* low part is right in big-endian      */
> +# else
> +#  define C_LDHI       ldr     /* high part is right in little-endian  */
> +#  define C_STHI       sdr     /* high part is right in little-endian  */
> +#  define C_LDLO       ldl     /* low part is left in little-endian    */
> +#  define C_STLO       sdl     /* low part is left in little-endian    */
> +# endif
> +# define C_ALIGN       dalign  /* r6 align instruction                 */
> +#else
> +# define C_ST  sw
> +# define C_LD  lw
> +# ifdef __MIPSEB
> +#  define C_LDHI       lwl     /* high part is left in big-endian      */
> +#  define C_STHI       swl     /* high part is left in big-endian      */
> +#  define C_LDLO       lwr     /* low part is right in big-endian      */
> +#  define C_STLO       swr     /* low part is right in big-endian      */
> +# else
> +#  define C_LDHI       lwr     /* high part is right in little-endian  */
> +#  define C_STHI       swr     /* high part is right in little-endian  */
> +#  define C_LDLO       lwl     /* low part is left in little-endian    */
> +#  define C_STLO       swl     /* low part is left in little-endian    */
> +# endif
> +# define C_ALIGN       align   /* r6 align instruction                 */
> +#endif
>
> -#if __BYTE_ORDER == __BIG_ENDIAN
> -#  define LDHI ldl             /* high part is left in big-endian      */
> -#  define SDHI sdl             /* high part is left in big-endian      */
> -#  define LDLO ldr             /* low part is right in big-endian      */
> -#  define SDLO sdr             /* low part is right in big-endian      */
> +/* Bookkeeping values for 32 vs. 64 bit mode.  */
> +#ifdef USE_DOUBLE
> +# define NSIZE 8
> +# define NSIZEMASK 0x3f
> +# define NSIZEDMASK 0x7f
>  #else
> -#  define LDHI ldr             /* high part is right in little-endian  */
> -#  define SDHI sdr             /* high part is right in little-endian  */
> -#  define LDLO ldl             /* low part is left in little-endian    */
> -#  define SDLO sdl             /* low part is left in little-endian    */
> +# define NSIZE 4
> +# define NSIZEMASK 0x1f
> +# define NSIZEDMASK 0x3f
>  #endif
> +#define UNIT(unit) ((unit)*NSIZE)
> +#define UNITM1(unit) (((unit)*NSIZE)-1)
>
> -ENTRY (memcpy)
> +#ifdef ANDROID_CHANGES
> +LEAF(MEMCPY_NAME, 0)
> +#else
> +LEAF(MEMCPY_NAME)
> +#endif
> +       .set    nomips16
>         .set    noreorder
> +/*
> + * Below we handle the case where memcpy is called with overlapping src and dst.
> + * Although memcpy is not required to handle this case, some parts of Android
> + * like Skia rely on such usage. We call memmove to handle such cases.
> + */
> +#ifdef USE_MEMMOVE_FOR_OVERLAP
> +       PTR_SUBU t0,a0,a1
> +       PTR_SRA t2,t0,31
> +       xor     t1,t0,t2
> +       PTR_SUBU t0,t1,t2
> +       sltu    t2,t0,a2
> +       beq     t2,zero,L(memcpy)
> +       la      t9,memmove
> +       jr      t9
> +        nop
> +L(memcpy):
> +#endif
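
In C terms, the five instructions above compute |dst - src| without a branch
and fall back to memmove when the buffers overlap.  A rough, hypothetical
rendering (function and variable names are mine, not from the patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch of the USE_MEMMOVE_FOR_OVERLAP prologue.  */
    static void *copy_or_move(void *dst, const void *src, size_t n)
    {
        ptrdiff_t d = (char *)dst - (const char *)src;  /* t0 = a0 - a1       */
        ptrdiff_t s = d >> (sizeof d * 8 - 1);          /* t2 = sign mask     */
        uintptr_t dist = (uintptr_t)((d ^ s) - s);      /* |d|, branch-free   */

        if (dist < n)                  /* ranges overlap: memcpy is unsafe    */
            return memmove(dst, src, n);
        return memcpy(dst, src, n);    /* corresponds to L(memcpy) above      */
    }
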
> +/*
> + * If the size is less than 2*NSIZE (8 or 16), go to L(lasts).  Regardless of
> + * size, copy dst pointer to v0 for the return value.
> + */
> +       slti    t2,a2,(2 * NSIZE)
> +       bne     t2,zero,L(lasts)
> +#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
> +       move    v0,zero
> +#else
> +       move    v0,a0
> +#endif
>
> -       slti    t0, a2, 16              # Less than 16?
> -       bne     t0, zero, L(last16)
> -       move    v0, a0                  # Setup exit value before too late
> -
> -       xor     t0, a1, a0              # Find a0/a1 displacement
> -       andi    t0, 0x7
> -       bne     t0, zero, L(shift)      # Go handle the unaligned case
> -       PTR_SUBU t1, zero, a1
> -       andi    t1, 0x7                 # a0/a1 are aligned, but are we
> -       beq     t1, zero, L(chk8w)      #  starting in the middle of a word?
> -       PTR_SUBU a2, t1
> -       LDHI    t0, 0(a1)               # Yes we are... take care of that
> -       PTR_ADDU a1, t1
> -       SDHI    t0, 0(a0)
> -       PTR_ADDU a0, t1
> -
> -L(chk8w):
> -       andi    t0, a2, 0x3f            # 64 or more bytes left?
> -       beq     t0, a2, L(chk1w)
> -       PTR_SUBU a3, a2, t0             # Yes
> -       PTR_ADDU a3, a1                 # a3 = end address of loop
> -       move    a2, t0                  # a2 = what will be left after loop
> -L(lop8w):
> -       ld      t0,  0(a1)              # Loop taking 8 words at a time
> -       ld      t1,  8(a1)
> -       ld      t2, 16(a1)
> -       ld      t3, 24(a1)
> -       ld      ta0, 32(a1)
> -       ld      ta1, 40(a1)
> -       ld      ta2, 48(a1)
> -       ld      ta3, 56(a1)
> -       PTR_ADDIU a0, 64
> -       PTR_ADDIU a1, 64
> -       sd      t0, -64(a0)
> -       sd      t1, -56(a0)
> -       sd      t2, -48(a0)
> -       sd      t3, -40(a0)
> -       sd      ta0, -32(a0)
> -       sd      ta1, -24(a0)
> -       sd      ta2, -16(a0)
> -       bne     a1, a3, L(lop8w)
> -       sd      ta3,  -8(a0)
> +#ifndef R6_CODE
>
> -L(chk1w):
> -       andi    t0, a2, 0x7             # 8 or more bytes left?
> -       beq     t0, a2, L(last16)
> -       PTR_SUBU a3, a2, t0             # Yes, handle them one dword at a time
> -       PTR_ADDU a3, a1                 # a3 again end address
> -       move    a2, t0
> -L(lop1w):
> -       ld      t0, 0(a1)
> -       PTR_ADDIU a0, 8
> -       PTR_ADDIU a1, 8
> -       bne     a1, a3, L(lop1w)
> -       sd      t0, -8(a0)
> -
> -L(last16):
> -       blez    a2, L(lst16e)           # Handle last 16 bytes, one at a time
> -       PTR_ADDU a3, a2, a1
> -L(lst16l):
> -       lb      t0, 0(a1)
> -       PTR_ADDIU a0, 1
> -       PTR_ADDIU a1, 1
> -       bne     a1, a3, L(lst16l)
> -       sb      t0, -1(a0)
> -L(lst16e):
> -       jr      ra                      # Bye, bye
> -       nop
> +/*
> + * If src and dst have different alignments, go to L(unaligned); if they
> + * have the same alignment (but are not actually aligned) do a partial
> + * load/store to make them aligned.  If they are both already aligned
> + * we can start copying at L(aligned).
> + */
> +       xor     t8,a1,a0
> +       andi    t8,t8,(NSIZE-1)         /* t8 is a0/a1 word-displacement */
> +       bne     t8,zero,L(unaligned)
> +       PTR_SUBU a3, zero, a0
>
> -L(shift):
> -       PTR_SUBU a3, zero, a0           # Src and Dest unaligned
> -       andi    a3, 0x7                 #  (unoptimized case...)
> -       beq     a3, zero, L(shft1)
> -       PTR_SUBU a2, a3                 # a2 = bytes left
> -       LDHI    t0, 0(a1)               # Take care of first odd part
> -       LDLO    t0, 7(a1)
> -       PTR_ADDU a1, a3
> -       SDHI    t0, 0(a0)
> -       PTR_ADDU a0, a3
> -L(shft1):
> -       andi    t0, a2, 0x7
> -       PTR_SUBU a3, a2, t0
> -       PTR_ADDU a3, a1
> -L(shfth):
> -       LDHI    t1, 0(a1)               # Limp through, dword by dword
> -       LDLO    t1, 7(a1)
> -       PTR_ADDIU a0, 8
> -       PTR_ADDIU a1, 8
> -       bne     a1, a3, L(shfth)
> -       sd      t1, -8(a0)
> -       b       L(last16)               # Handle anything which may be left
> -       move    a2, t0
> +       andi    a3,a3,(NSIZE-1)         /* copy a3 bytes to align a0/a1   */
> +       beq     a3,zero,L(aligned)      /* if a3=0, it is already aligned */
> +       PTR_SUBU a2,a2,a3               /* a2 is the remaining byte count */
>
> -       .set    reorder
> -END (memcpy)
> +       C_LDHI  t8,0(a1)
> +       PTR_ADDU a1,a1,a3
> +       C_STHI  t8,0(a0)
> +       PTR_ADDU a0,a0,a3
> +
> +#else /* R6_CODE */
> +
> +/*
> + * Align the destination and hope that the source gets aligned too.  If it
> + * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
> + * align instruction.
> + */
> +       andi    t8,a0,7
> +       lapc    t9,L(atable)
> +       PTR_LSA t9,t8,t9,2
> +       jrc     t9
> +L(atable):
> +       bc      L(lb0)
> +       bc      L(lb7)
> +       bc      L(lb6)
> +       bc      L(lb5)
> +       bc      L(lb4)
> +       bc      L(lb3)
> +       bc      L(lb2)
> +       bc      L(lb1)
> +L(lb7):
> +       lb      a3, 6(a1)
> +       sb      a3, 6(a0)
> +L(lb6):
> +       lb      a3, 5(a1)
> +       sb      a3, 5(a0)
> +L(lb5):
> +       lb      a3, 4(a1)
> +       sb      a3, 4(a0)
> +L(lb4):
> +       lb      a3, 3(a1)
> +       sb      a3, 3(a0)
> +L(lb3):
> +       lb      a3, 2(a1)
> +       sb      a3, 2(a0)
> +L(lb2):
> +       lb      a3, 1(a1)
> +       sb      a3, 1(a0)
> +L(lb1):
> +       lb      a3, 0(a1)
> +       sb      a3, 0(a0)
> +
> +       li      t9,8
> +       subu    t8,t9,t8
> +       PTR_SUBU a2,a2,t8
> +       PTR_ADDU a0,a0,t8
> +       PTR_ADDU a1,a1,t8
> +L(lb0):
>
> -#else /* !__mips64 */
> +       andi    t8,a1,(NSIZE-1)
> +       lapc    t9,L(jtable)
> +       PTR_LSA t9,t8,t9,2
> +       jrc     t9
> +L(jtable):
> +        bc      L(aligned)
> +        bc      L(r6_unaligned1)
> +        bc      L(r6_unaligned2)
> +        bc      L(r6_unaligned3)
> +# ifdef USE_DOUBLE
> +        bc      L(r6_unaligned4)
> +        bc      L(r6_unaligned5)
> +        bc      L(r6_unaligned6)
> +        bc      L(r6_unaligned7)
> +# endif
> +#endif /* R6_CODE */
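
The branch table above (L(atable) plus the fall-through lb7..lb1 labels) simply
copies the leading bytes needed to bring the destination up to an 8-byte
boundary.  A C sketch of the same idea using switch fall-through (helper name
is mine; it assumes n >= 8, which the earlier size check guarantees):

    #include <stddef.h>
    #include <stdint.h>

    /* C analogue of the R6 destination-alignment prologue above.  */
    static size_t align_dst_to_8(unsigned char **pd, const unsigned char **ps,
                                 size_t n)
    {
        unsigned char *d = *pd;
        const unsigned char *s = *ps;
        size_t t8 = (uintptr_t)d & 7;     /* bytes past an 8-byte boundary    */

        switch (t8) {                     /* 0 means already aligned          */
        case 1: d[6] = s[6];              /* fall through, like lb7..lb1      */
        case 2: d[5] = s[5];
        case 3: d[4] = s[4];
        case 4: d[3] = s[3];
        case 5: d[2] = s[2];
        case 6: d[1] = s[1];
        case 7: d[0] = s[0];
            *pd = d + (8 - t8);           /* 8 - t8 leading bytes copied      */
            *ps = s + (8 - t8);
            n -= 8 - t8;
        }
        return n;                         /* bytes still left to copy         */
    }
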
>
> -#if __BYTE_ORDER == __BIG_ENDIAN
> -#  define LWHI lwl             /* high part is left in big-endian      */
> -#  define SWHI swl             /* high part is left in big-endian      */
> -#  define LWLO lwr             /* low part is right in big-endian      */
> -#  define SWLO swr             /* low part is right in big-endian      */
> +L(aligned):
> +
> +/*
> + * Now dst and src are both aligned to (word or double word) boundaries.
> + * Set a2 to count how many bytes we have to copy after all the 64/128 byte
> + * chunks are copied and a3 to the dst pointer after all the 64/128 byte
> + * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
> + * equals a3.
> + */
> +
> +       andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
> +       beq     a2,t8,L(chkw)    /* if a2==t8, no 64-byte/128-byte chunks */
> +       PTR_SUBU a3,a2,t8        /* subtract the remainder from a2 */
> +       PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */
> +
> +/* When in the loop we may prefetch with the 'prepare to store' hint;
> + * in this case a0+x should not be past the "t0-32" address.  This
> + * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
> + * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
> + * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
> + */
> +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +       PTR_ADDU t0,a0,a2               /* t0 is the "past the end" address */
> +       PTR_SUBU t9,t0,PREFETCH_LIMIT   /* t9 is the "last safe pref" address */
> +#endif
> +       PREFETCH_FOR_LOAD  (0, a1)
> +       PREFETCH_FOR_LOAD  (1, a1)
> +       PREFETCH_FOR_LOAD  (2, a1)
> +       PREFETCH_FOR_LOAD  (3, a1)
> +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
> +       PREFETCH_FOR_STORE (1, a0)
> +       PREFETCH_FOR_STORE (2, a0)
> +       PREFETCH_FOR_STORE (3, a0)
> +#endif
> +#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
> +# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
> +       sltu    v1,t9,a0
> +       bgtz    v1,L(skip_set)
> +       nop
> +       PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
> +L(skip_set):
> +# else
> +       PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
> +# endif
> +#endif
> +#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
> +    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
> +       PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
> +# ifdef USE_DOUBLE
> +       PTR_ADDIU v0,v0,32
> +# endif
> +#endif
> +L(loop16w):
> +       C_LD    t0,UNIT(0)(a1)
> +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +       sltu    v1,t9,a0                /* If a0 > t9 don't use next prefetch */
> +       bgtz    v1,L(skip_pref)
> +#endif
> +       C_LD    t1,UNIT(1)(a1)
> +#ifdef R6_CODE
> +       PREFETCH_FOR_STORE (2, a0)
>  #else
> -#  define LWHI lwr             /* high part is right in little-endian  */
> -#  define SWHI swr             /* high part is right in little-endian  */
> -#  define LWLO lwl             /* low part is left in little-endian    */
> -#  define SWLO swl             /* low part is left in little-endian    */
> +       PREFETCH_FOR_STORE (4, a0)
> +       PREFETCH_FOR_STORE (5, a0)
> +#endif
> +#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
> +       PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
> +# ifdef USE_DOUBLE
> +       PTR_ADDIU v0,v0,32
> +# endif
>  #endif
> +L(skip_pref):
> +       C_LD    REG2,UNIT(2)(a1)
> +       C_LD    REG3,UNIT(3)(a1)
> +       C_LD    REG4,UNIT(4)(a1)
> +       C_LD    REG5,UNIT(5)(a1)
> +       C_LD    REG6,UNIT(6)(a1)
> +       C_LD    REG7,UNIT(7)(a1)
> +#ifdef R6_CODE
> +       PREFETCH_FOR_LOAD (3, a1)
> +#else
> +       PREFETCH_FOR_LOAD (4, a1)
> +#endif
> +       C_ST    t0,UNIT(0)(a0)
> +       C_ST    t1,UNIT(1)(a0)
> +       C_ST    REG2,UNIT(2)(a0)
> +       C_ST    REG3,UNIT(3)(a0)
> +       C_ST    REG4,UNIT(4)(a0)
> +       C_ST    REG5,UNIT(5)(a0)
> +       C_ST    REG6,UNIT(6)(a0)
> +       C_ST    REG7,UNIT(7)(a0)
>
> -ENTRY (memcpy)
> -       .set    noreorder
> +       C_LD    t0,UNIT(8)(a1)
> +       C_LD    t1,UNIT(9)(a1)
> +       C_LD    REG2,UNIT(10)(a1)
> +       C_LD    REG3,UNIT(11)(a1)
> +       C_LD    REG4,UNIT(12)(a1)
> +       C_LD    REG5,UNIT(13)(a1)
> +       C_LD    REG6,UNIT(14)(a1)
> +       C_LD    REG7,UNIT(15)(a1)
> +#ifndef R6_CODE
> +        PREFETCH_FOR_LOAD (5, a1)
> +#endif
> +       C_ST    t0,UNIT(8)(a0)
> +       C_ST    t1,UNIT(9)(a0)
> +       C_ST    REG2,UNIT(10)(a0)
> +       C_ST    REG3,UNIT(11)(a0)
> +       C_ST    REG4,UNIT(12)(a0)
> +       C_ST    REG5,UNIT(13)(a0)
> +       C_ST    REG6,UNIT(14)(a0)
> +       C_ST    REG7,UNIT(15)(a0)
> +       PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
> +       bne     a0,a3,L(loop16w)
> +       PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
> +       move    a2,t8
> +
> +/* Here we have src and dest word-aligned but less than 64-bytes or
> + * 128 bytes to go.  Check for a 32 (64) byte chunk and copy it if there
> + * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
> + * the copy.
> + */
> +
> +L(chkw):
> +       PREFETCH_FOR_LOAD (0, a1)
> +       andi    t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk.  */
> +                               /* t8 is the remainder count past 32 bytes */
> +       beq     a2,t8,L(chk1w)  /* When a2=t8, no 32-byte chunk  */
> +       nop
> +       C_LD    t0,UNIT(0)(a1)
> +       C_LD    t1,UNIT(1)(a1)
> +       C_LD    REG2,UNIT(2)(a1)
> +       C_LD    REG3,UNIT(3)(a1)
> +       C_LD    REG4,UNIT(4)(a1)
> +       C_LD    REG5,UNIT(5)(a1)
> +       C_LD    REG6,UNIT(6)(a1)
> +       C_LD    REG7,UNIT(7)(a1)
> +       PTR_ADDIU a1,a1,UNIT(8)
> +       C_ST    t0,UNIT(0)(a0)
> +       C_ST    t1,UNIT(1)(a0)
> +       C_ST    REG2,UNIT(2)(a0)
> +       C_ST    REG3,UNIT(3)(a0)
> +       C_ST    REG4,UNIT(4)(a0)
> +       C_ST    REG5,UNIT(5)(a0)
> +       C_ST    REG6,UNIT(6)(a0)
> +       C_ST    REG7,UNIT(7)(a0)
> +       PTR_ADDIU a0,a0,UNIT(8)
> +
> +/*
> + * Here we have less than 32(64) bytes to copy.  Set up for a loop to
> + * copy one word (or double word) at a time.  Set a2 to count how many
> + * bytes we have to copy after all the word (or double word) chunks are
> + * copied and a3 to the dst pointer after all the (d)word chunks have
> + * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
> + */
> +L(chk1w):
> +       andi    a2,t8,(NSIZE-1) /* a2 is the remainder past the (d)word chunks */
> +       beq     a2,t8,L(lastw)
> +       PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
> +       PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */
>
> -       slti    t0, a2, 8               # Less than 8?
> -       bne     t0, zero, L(last8)
> -       move    v0, a0                  # Setup exit value before too late
> -
> -       xor     t0, a1, a0              # Find a0/a1 displacement
> -       andi    t0, 0x3
> -       bne     t0, zero, L(shift)      # Go handle the unaligned case
> -       subu    t1, zero, a1
> -       andi    t1, 0x3                 # a0/a1 are aligned, but are we
> -       beq     t1, zero, L(chk8w)      #  starting in the middle of a word?
> -       subu    a2, t1
> -       LWHI    t0, 0(a1)               # Yes we are... take care of that
> -       addu    a1, t1
> -       SWHI    t0, 0(a0)
> -       addu    a0, t1
> -
> -L(chk8w):
> -       andi    t0, a2, 0x1f            # 32 or more bytes left?
> -       beq     t0, a2, L(chk1w)
> -       subu    a3, a2, t0              # Yes
> -       addu    a3, a1                  # a3 = end address of loop
> -       move    a2, t0                  # a2 = what will be left after loop
> -L(lop8w):
> -       lw      t0,  0(a1)              # Loop taking 8 words at a time
> -       lw      t1,  4(a1)
> -       lw      t2,  8(a1)
> -       lw      t3, 12(a1)
> -       lw      t4, 16(a1)
> -       lw      t5, 20(a1)
> -       lw      t6, 24(a1)
> -       lw      t7, 28(a1)
> -       addiu   a0, 32
> -       addiu   a1, 32
> -       sw      t0, -32(a0)
> -       sw      t1, -28(a0)
> -       sw      t2, -24(a0)
> -       sw      t3, -20(a0)
> -       sw      t4, -16(a0)
> -       sw      t5, -12(a0)
> -       sw      t6,  -8(a0)
> -       bne     a1, a3, L(lop8w)
> -       sw      t7,  -4(a0)
> -
> -L(chk1w):
> -       andi    t0, a2, 0x3             # 4 or more bytes left?
> -       beq     t0, a2, L(last8)
> -       subu    a3, a2, t0              # Yes, handle them one word at a time
> -       addu    a3, a1                  # a3 again end address
> -       move    a2, t0
> -L(lop1w):
> -       lw      t0, 0(a1)
> -       addiu   a0, 4
> -       addiu   a1, 4
> -       bne     a1, a3, L(lop1w)
> -       sw      t0, -4(a0)
> -
> -L(last8):
> -       blez    a2, L(lst8e)            # Handle last 8 bytes, one at a time
> -       addu    a3, a2, a1
> -L(lst8l):
> -       lb      t0, 0(a1)
> -       addiu   a0, 1
> -       addiu   a1, 1
> -       bne     a1, a3, L(lst8l)
> -       sb      t0, -1(a0)
> -L(lst8e):
> -       jr      ra                      # Bye, bye
> +/* copying in words (4-byte or 8-byte chunks) */
> +L(wordCopy_loop):
> +       C_LD    REG3,UNIT(0)(a1)
> +       PTR_ADDIU a0,a0,UNIT(1)
> +       PTR_ADDIU a1,a1,UNIT(1)
> +       bne     a0,a3,L(wordCopy_loop)
> +       C_ST    REG3,UNIT(-1)(a0)
> +
> +/* If we have been copying double words, see if we can copy a single word
> +   before doing byte copies.  We can have, at most, one word to copy.  */
> +
> +L(lastw):
> +#ifdef USE_DOUBLE
> +       andi    t8,a2,3         /* a2 is the remainder past 4 byte chunks.  */
> +       beq     t8,a2,L(lastb)
> +       lw      REG3,0(a1)
> +       sw      REG3,0(a0)
> +       PTR_ADDIU a0,a0,4
> +       PTR_ADDIU a1,a1,4
> +       move    a2,t8
> +#endif
> +
> +/* Copy the last 8 (or 16) bytes */
> +L(lastb):
> +       blez    a2,L(leave)
> +       PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
> +L(lastbloop):
> +       lb      v1,0(a1)
> +       PTR_ADDIU a0,a0,1
> +       PTR_ADDIU a1,a1,1
> +       bne     a0,a3,L(lastbloop)
> +       sb      v1,-1(a0)
> +L(leave):
> +       j       ra
>         nop
>
> -L(shift):
> -       subu    a3, zero, a0            # Src and Dest unaligned
> -       andi    a3, 0x3                 #  (unoptimized case...)
> -       beq     a3, zero, L(shft1)
> -       subu    a2, a3                  # a2 = bytes left
> -       LWHI    t0, 0(a1)               # Take care of first odd part
> -       LWLO    t0, 3(a1)
> -       addu    a1, a3
> -       SWHI    t0, 0(a0)
> -       addu    a0, a3
> -L(shft1):
> -       andi    t0, a2, 0x3
> -       subu    a3, a2, t0
> -       addu    a3, a1
> -L(shfth):
> -       LWHI    t1, 0(a1)               # Limp through, word by word
> -       LWLO    t1, 3(a1)
> -       addiu   a0, 4
> -       addiu   a1, 4
> -       bne     a1, a3, L(shfth)
> -       sw      t1, -4(a0)
> -       b       L(last8)                # Handle anything which may be left
> -       move    a2, t0
> +/* We jump here with a memcpy of less than 8 or 16 bytes, depending on
> +   whether or not USE_DOUBLE is defined.  Instead of just doing byte
> +   copies, check the alignment and size and use lw/sw if possible.
> +   Otherwise, do byte copies.  */
>
> -       .set    reorder
> -END (memcpy)
> +L(lasts):
> +       andi    t8,a2,3
> +       beq     t8,a2,L(lastb)
> +
> +       andi    t9,a0,3
> +       bne     t9,zero,L(lastb)
> +       andi    t9,a1,3
> +       bne     t9,zero,L(lastb)
> +
> +       PTR_SUBU a3,a2,t8
> +       PTR_ADDU a3,a0,a3
> +
> +L(wcopy_loop):
> +       lw      REG3,0(a1)
> +       PTR_ADDIU a0,a0,4
> +       PTR_ADDIU a1,a1,4
> +       bne     a0,a3,L(wcopy_loop)
> +       sw      REG3,-4(a0)
>
> -#endif /* !__mips64 */
> +       b       L(lastb)
> +       move    a2,t8
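
The L(lasts) path above is the small-size fast path: when at least one whole
word remains and both pointers are 4-byte aligned, it copies words before
falling back to bytes.  A sketch of that dispatch (helper name is mine; the
casts ignore strict-aliasing niceties and just mirror the lw/sw pairs):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the L(lasts) small-size path.  */
    static void small_copy(unsigned char *d, const unsigned char *s, size_t n)
    {
        if (n >= 4 && (((uintptr_t)d | (uintptr_t)s) & 3) == 0) {
            for (; n >= 4; n -= 4, d += 4, s += 4)     /* aligned word copies */
                *(uint32_t *)(void *)d = *(const uint32_t *)(const void *)s;
        }
        while (n--)                                    /* byte tail, L(lastb) */
            *d++ = *s++;
    }
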
>
> -libc_hidden_def(memcpy)
> +#ifndef R6_CODE
> +/*
> + * UNALIGNED case, got here with a3 = "negu a0"
> + * This code is nearly identical to the aligned code above
> + * but only the destination (not the source) gets aligned
> + * so we need to do partial loads of the source followed
> + * by normal stores to the destination (once we have aligned
> + * the destination).
> + */
> +
> +L(unaligned):
> +       andi    a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
> +       beqz    a3,L(ua_chk16w) /* if a3=0, it is already aligned */
> +       PTR_SUBU a2,a2,a3       /* a2 is the remaining byte count */
> +
> +       C_LDHI  v1,UNIT(0)(a1)
> +       C_LDLO  v1,UNITM1(1)(a1)
> +       PTR_ADDU a1,a1,a3
> +       C_STHI  v1,UNIT(0)(a0)
> +       PTR_ADDU a0,a0,a3
> +
> +/*
> + * Now the destination (but not the source) is aligned.
> + * Set a2 to count how many bytes we have to copy after all the 64/128 byte
> + * chunks are copied and a3 to the dst pointer after all the 64/128 byte
> + * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
> + * equals a3.
> + */
> +
> +L(ua_chk16w):
> +       andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
> +       beq     a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
> +       PTR_SUBU a3,a2,t8        /* subtract the remainder from a2 */
> +       PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */
> +
> +# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +       PTR_ADDU t0,a0,a2         /* t0 is the "past the end" address */
> +       PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
> +# endif
> +       PREFETCH_FOR_LOAD  (0, a1)
> +       PREFETCH_FOR_LOAD  (1, a1)
> +       PREFETCH_FOR_LOAD  (2, a1)
> +# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
> +       PREFETCH_FOR_STORE (1, a0)
> +       PREFETCH_FOR_STORE (2, a0)
> +       PREFETCH_FOR_STORE (3, a0)
> +# endif
> +# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
> +#  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +       sltu    v1,t9,a0
> +       bgtz    v1,L(ua_skip_set)
> +       nop
> +       PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
> +L(ua_skip_set):
> +#  else
> +       PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
> +#  endif
> +# endif
> +L(ua_loop16w):
> +       PREFETCH_FOR_LOAD  (3, a1)
> +       C_LDHI  t0,UNIT(0)(a1)
> +       C_LDHI  t1,UNIT(1)(a1)
> +       C_LDHI  REG2,UNIT(2)(a1)
> +# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +       sltu    v1,t9,a0
> +       bgtz    v1,L(ua_skip_pref)
> +# endif
> +       C_LDHI  REG3,UNIT(3)(a1)
> +       PREFETCH_FOR_STORE (4, a0)
> +       PREFETCH_FOR_STORE (5, a0)
> +L(ua_skip_pref):
> +       C_LDHI  REG4,UNIT(4)(a1)
> +       C_LDHI  REG5,UNIT(5)(a1)
> +       C_LDHI  REG6,UNIT(6)(a1)
> +       C_LDHI  REG7,UNIT(7)(a1)
> +       C_LDLO  t0,UNITM1(1)(a1)
> +       C_LDLO  t1,UNITM1(2)(a1)
> +       C_LDLO  REG2,UNITM1(3)(a1)
> +       C_LDLO  REG3,UNITM1(4)(a1)
> +       C_LDLO  REG4,UNITM1(5)(a1)
> +       C_LDLO  REG5,UNITM1(6)(a1)
> +       C_LDLO  REG6,UNITM1(7)(a1)
> +       C_LDLO  REG7,UNITM1(8)(a1)
> +        PREFETCH_FOR_LOAD (4, a1)
> +       C_ST    t0,UNIT(0)(a0)
> +       C_ST    t1,UNIT(1)(a0)
> +       C_ST    REG2,UNIT(2)(a0)
> +       C_ST    REG3,UNIT(3)(a0)
> +       C_ST    REG4,UNIT(4)(a0)
> +       C_ST    REG5,UNIT(5)(a0)
> +       C_ST    REG6,UNIT(6)(a0)
> +       C_ST    REG7,UNIT(7)(a0)
> +       C_LDHI  t0,UNIT(8)(a1)
> +       C_LDHI  t1,UNIT(9)(a1)
> +       C_LDHI  REG2,UNIT(10)(a1)
> +       C_LDHI  REG3,UNIT(11)(a1)
> +       C_LDHI  REG4,UNIT(12)(a1)
> +       C_LDHI  REG5,UNIT(13)(a1)
> +       C_LDHI  REG6,UNIT(14)(a1)
> +       C_LDHI  REG7,UNIT(15)(a1)
> +       C_LDLO  t0,UNITM1(9)(a1)
> +       C_LDLO  t1,UNITM1(10)(a1)
> +       C_LDLO  REG2,UNITM1(11)(a1)
> +       C_LDLO  REG3,UNITM1(12)(a1)
> +       C_LDLO  REG4,UNITM1(13)(a1)
> +       C_LDLO  REG5,UNITM1(14)(a1)
> +       C_LDLO  REG6,UNITM1(15)(a1)
> +       C_LDLO  REG7,UNITM1(16)(a1)
> +        PREFETCH_FOR_LOAD (5, a1)
> +       C_ST    t0,UNIT(8)(a0)
> +       C_ST    t1,UNIT(9)(a0)
> +       C_ST    REG2,UNIT(10)(a0)
> +       C_ST    REG3,UNIT(11)(a0)
> +       C_ST    REG4,UNIT(12)(a0)
> +       C_ST    REG5,UNIT(13)(a0)
> +       C_ST    REG6,UNIT(14)(a0)
> +       C_ST    REG7,UNIT(15)(a0)
> +       PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
> +       bne     a0,a3,L(ua_loop16w)
> +       PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
> +       move    a2,t8
> +
> +/* Here we have src and dest word-aligned but less than 64-bytes or
> + * 128 bytes to go.  Check for a 32 (64) byte chunk and copy it if there
> + * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
> + * the copy.  */
> +
> +L(ua_chkw):
> +       PREFETCH_FOR_LOAD (0, a1)
> +       andi    t8,a2,NSIZEMASK   /* Is there a 32-byte/64-byte chunk.  */
> +                                 /* t8 is the remainder count past 32 bytes */
> +       beq     a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
> +       nop
> +       C_LDHI  t0,UNIT(0)(a1)
> +       C_LDHI  t1,UNIT(1)(a1)
> +       C_LDHI  REG2,UNIT(2)(a1)
> +       C_LDHI  REG3,UNIT(3)(a1)
> +       C_LDHI  REG4,UNIT(4)(a1)
> +       C_LDHI  REG5,UNIT(5)(a1)
> +       C_LDHI  REG6,UNIT(6)(a1)
> +       C_LDHI  REG7,UNIT(7)(a1)
> +       C_LDLO  t0,UNITM1(1)(a1)
> +       C_LDLO  t1,UNITM1(2)(a1)
> +       C_LDLO  REG2,UNITM1(3)(a1)
> +       C_LDLO  REG3,UNITM1(4)(a1)
> +       C_LDLO  REG4,UNITM1(5)(a1)
> +       C_LDLO  REG5,UNITM1(6)(a1)
> +       C_LDLO  REG6,UNITM1(7)(a1)
> +       C_LDLO  REG7,UNITM1(8)(a1)
> +       PTR_ADDIU a1,a1,UNIT(8)
> +       C_ST    t0,UNIT(0)(a0)
> +       C_ST    t1,UNIT(1)(a0)
> +       C_ST    REG2,UNIT(2)(a0)
> +       C_ST    REG3,UNIT(3)(a0)
> +       C_ST    REG4,UNIT(4)(a0)
> +       C_ST    REG5,UNIT(5)(a0)
> +       C_ST    REG6,UNIT(6)(a0)
> +       C_ST    REG7,UNIT(7)(a0)
> +       PTR_ADDIU a0,a0,UNIT(8)
> +/*
> + * Here we have less than 32(64) bytes to copy.  Set up for a loop to
> + * copy one word (or double word) at a time.
> + */
> +L(ua_chk1w):
> +       andi    a2,t8,(NSIZE-1) /* a2 is the remainder past the (d)word chunks */
> +       beq     a2,t8,L(ua_smallCopy)
> +       PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
> +       PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */
> +
> +/* copying in words (4-byte or 8-byte chunks) */
> +L(ua_wordCopy_loop):
> +       C_LDHI  v1,UNIT(0)(a1)
> +       C_LDLO  v1,UNITM1(1)(a1)
> +       PTR_ADDIU a0,a0,UNIT(1)
> +       PTR_ADDIU a1,a1,UNIT(1)
> +       bne     a0,a3,L(ua_wordCopy_loop)
> +       C_ST    v1,UNIT(-1)(a0)
> +
> +/* Copy the last 8 (or 16) bytes */
> +L(ua_smallCopy):
> +       beqz    a2,L(leave)
> +       PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
> +L(ua_smallCopy_loop):
> +       lb      v1,0(a1)
> +       PTR_ADDIU a0,a0,1
> +       PTR_ADDIU a1,a1,1
> +       bne     a0,a3,L(ua_smallCopy_loop)
> +       sb      v1,-1(a0)
> +
> +       j       ra
> +       nop
> +
> +#else /* R6_CODE */
> +
> +# ifdef __MIPSEB
> +#  define SWAP_REGS(X,Y) X, Y
> +#  define ALIGN_OFFSET(N) (N)
> +# else
> +#  define SWAP_REGS(X,Y) Y, X
> +#  define ALIGN_OFFSET(N) (NSIZE-N)
> +# endif
> +# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
> +       andi    REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
> +       beq     REG7, a2, L(lastb); /* Check for bytes to copy by word     */ \
> +       PTR_SUBU a3, a2, REG7;  /* a3 is number of bytes to be copied in   */ \
> +                               /* (d)word chunks.                         */ \
> +       move    a2, REG7;       /* a2 is # of bytes to copy byte by byte   */ \
> +                               /* after word loop is finished.            */ \
> +       PTR_ADDU REG6, a0, a3;  /* REG6 is the dst address after loop.     */ \
> +       PTR_SUBU REG2, a1, t8;  /* REG2 is the aligned src address.        */ \
> +       PTR_ADDU a1, a1, a3;    /* a1 is addr of source after word loop.   */ \
> +       C_LD    t0, UNIT(0)(REG2);  /* Load first part of source.          */ \
> +L(r6_ua_wordcopy##BYTEOFFSET):                                               \
> +       C_LD    t1, UNIT(1)(REG2);  /* Load second part of source.         */ \
> +       C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET);             \
> +       PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.      */ \
> +       PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \
> +       move    t0, t1;         /* Move second part of source to first.    */ \
> +       bne     a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);                       \
> +       C_ST    REG3, UNIT(-1)(a0);                                           \
> +       j       L(lastb);                                                     \
> +       nop
> +
> +       /* We are generating R6 code; the destination is aligned and the
> +          source is not.  t8 is 1 to NSIZE-1, depending on the
> +          alignment of the source.  */
> +
> +L(r6_unaligned1):
> +       R6_UNALIGNED_WORD_COPY(1)
> +L(r6_unaligned2):
> +       R6_UNALIGNED_WORD_COPY(2)
> +L(r6_unaligned3):
> +       R6_UNALIGNED_WORD_COPY(3)
> +# ifdef USE_DOUBLE
> +L(r6_unaligned4):
> +       R6_UNALIGNED_WORD_COPY(4)
> +L(r6_unaligned5):
> +       R6_UNALIGNED_WORD_COPY(5)
> +L(r6_unaligned6):
> +       R6_UNALIGNED_WORD_COPY(6)
> +L(r6_unaligned7):
> +       R6_UNALIGNED_WORD_COPY(7)
> +# endif
> +#endif /* R6_CODE */
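
For readers unfamiliar with the r6 approach used by R6_UNALIGNED_WORD_COPY:
each destination word is stitched together from two consecutive aligned source
words at a fixed byte offset, which is what the single align/dalign instruction
does.  A little-endian 32-bit C illustration of that inner step (the shift form
is an illustration of the idea, not the instruction encoding):

    #include <stdint.h>

    /* Combine two aligned source words when the source pointer is 'off'
       bytes (1..3) past a word boundary; little-endian byte order.  */
    static uint32_t stitch32_le(uint32_t lo, uint32_t hi, unsigned off)
    {
        return (lo >> (8 * off)) | (hi << (8 * (4 - off)));
    }
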
> +
> +       .set    at
> +       .set    reorder
> +END(MEMCPY_NAME)
> +#ifndef ANDROID_CHANGES
> +# ifdef _LIBC
> +#  ifdef __UCLIBC__
> +libc_hidden_def(MEMCPY_NAME)
> +#  else
> +libc_hidden_builtin_def (MEMCPY_NAME)
> +#  endif
> +# endif
> +#endif
> diff --git a/libc/string/mips/memset.S b/libc/string/mips/memset.S
> index 26b2598..ef8ab0b 100644
> --- a/libc/string/mips/memset.S
> +++ b/libc/string/mips/memset.S
> @@ -1,6 +1,5 @@
> -/* Copyright (C) 2002, 2003 Free Software Foundation, Inc.
> +/* Copyright (C) 2013-2015 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
> -   Contributed by Hartvig Ekner <hartvige at mips.com>, 2002.
>
>     The GNU C Library is free software; you can redistribute it and/or
>     modify it under the terms of the GNU Lesser General Public
> @@ -13,145 +12,420 @@
>     Lesser General Public License for more details.
>
>     You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> +   License along with the GNU C Library.  If not, see
>     <http://www.gnu.org/licenses/>.  */
>
> -#include <features.h>
> -#include <sysdep.h>
> -#include <endian.h>
> +#ifdef ANDROID_CHANGES
> +# include "machine/asm.h"
> +# include "machine/regdef.h"
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#elif _LIBC
> +# include <sysdep.h>
> +# include <regdef.h>
> +# include <sys/asm.h>
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#elif defined _COMPILING_NEWLIB
> +# include "machine/asm.h"
> +# include "machine/regdef.h"
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#else
> +# include <regdef.h>
> +# include <sys/asm.h>
> +#endif
> +
> +/* Check to see if the MIPS architecture we are compiling for supports
> +   prefetching.  */
> +
> +#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
> +# ifndef DISABLE_PREFETCH
> +#  define USE_PREFETCH
> +# endif
> +#endif
> +
> +#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
> +# ifndef DISABLE_DOUBLE
> +#  define USE_DOUBLE
> +# endif
> +#endif
> +
> +#ifndef USE_DOUBLE
> +# ifndef DISABLE_DOUBLE_ALIGN
> +#  define DOUBLE_ALIGN
> +# endif
> +#endif
> +
> +
> +/* Some asm.h files do not have the L macro definition.  */
> +#ifndef L
> +# if _MIPS_SIM == _ABIO32
> +#  define L(label) $L ## label
> +# else
> +#  define L(label) .L ## label
> +# endif
> +#endif
> +
> +/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
> +#ifndef PTR_ADDIU
> +# ifdef USE_DOUBLE
> +#  define PTR_ADDIU    daddiu
> +# else
> +#  define PTR_ADDIU    addiu
> +# endif
> +#endif
>
> -/* void *memset(void *s, int c, size_t n).  */
> +/* New R6 instructions that may not be in asm.h.  */
> +#ifndef PTR_LSA
> +# if _MIPS_SIM == _ABI64
> +#  define PTR_LSA        dlsa
> +# else
> +#  define PTR_LSA        lsa
> +# endif
> +#endif
> +
> +/* Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
> +   or PREFETCH_STORE_STREAMED offers a large performance advantage
> +   but PREPAREFORSTORE has some special restrictions to consider.
> +
> +   Prefetch with the 'prepare for store' hint does not copy a memory
> +   location into the cache, it just allocates a cache line and zeros
> +   it out.  This means that if you do not write to the entire cache
> +   line before writing it out to memory some data will get zero'ed out
> +   when the cache line is written back to memory and data will be lost.
> +
> +   There are ifdef'ed sections of this memcpy to make sure that it does not
> +   do prefetches on cache lines that are not going to be completely written.
> +   This code is only needed and only used when PREFETCH_STORE_HINT is set to
> +   PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
> +   less than MAX_PREFETCH_SIZE bytes and if the cache line is larger it will
> +   not work correctly.  */
> +
> +#ifdef USE_PREFETCH
> +# define PREFETCH_HINT_STORE           1
> +# define PREFETCH_HINT_STORE_STREAMED  5
> +# define PREFETCH_HINT_STORE_RETAINED  7
> +# define PREFETCH_HINT_PREPAREFORSTORE 30
> +
> +/* If we have not picked out what hints to use at this point use the
> +   standard load and store prefetch hints.  */
> +# ifndef PREFETCH_STORE_HINT
> +#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
> +# endif
> +
> +/* We double everything when USE_DOUBLE is true so we do 2 prefetches to
> +   get 64 bytes in that case.  The assumption is that each individual
> +   prefetch brings in 32 bytes.  */
> +# ifdef USE_DOUBLE
> +#  define PREFETCH_CHUNK 64
> +#  define PREFETCH_FOR_STORE(chunk, reg) \
> +    pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
> +    pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
> +# else
> +#  define PREFETCH_CHUNK 32
> +#  define PREFETCH_FOR_STORE(chunk, reg) \
> +    pref PREFETCH_STORE_HINT, (chunk)*32(reg)
> +# endif
>
> -#ifdef __mips64
> +/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
> +   than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
> +   of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
> +   hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
> +   used then MAX_PREFETCH_SIZE does not matter.  */
> +# define MAX_PREFETCH_SIZE 128
> +/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
> +   than 5 on a STORE prefetch and that a single prefetch can never be larger
> +   than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
> +   we actually do two prefetches in that case, one 32 bytes after the other.  */
> +# ifdef USE_DOUBLE
> +#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
> +# else
> +#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
> +# endif
>
> -#include <sys/asm.h>
> +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
> +    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
> +/* We cannot handle this because the initial prefetches may fetch bytes that
> +   are before the buffer being copied.  We start copies with an offset
> +   of 4 so we avoid this situation when using PREPAREFORSTORE.  */
> +#  error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
> +# endif
> +#else /* USE_PREFETCH not defined */
> +# define PREFETCH_FOR_STORE(offset, reg)
> +#endif
> +
> +#if __mips_isa_rev > 5
> +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +#  undef PREFETCH_STORE_HINT
> +#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
> +# endif
> +# define R6_CODE
> +#endif
>
> -#if __BYTE_ORDER == __BIG_ENDIAN
> -# define SDHI  sdl             /* high part is left in big-endian      */
> +/* Allow the routine to be named something else if desired.  */
> +#ifndef MEMSET_NAME
> +# define MEMSET_NAME memset
> +#endif
> +
> +/* We load/store 64 bits at a time when USE_DOUBLE is true.
> +   The C_ prefix stands for CHUNK and is used to avoid macro name
> +   conflicts with system header files.  */
> +
> +#ifdef USE_DOUBLE
> +# define C_ST  sd
> +# ifdef __MIPSEB
> +#  define C_STHI       sdl     /* high part is left in big-endian      */
> +# else
> +#  define C_STHI       sdr     /* high part is right in little-endian  */
> +# endif
>  #else
> -# define SDHI  sdr             /* high part is right in little-endian  */
> +# define C_ST  sw
> +# ifdef __MIPSEB
> +#  define C_STHI       swl     /* high part is left in big-endian      */
> +# else
> +#  define C_STHI       swr     /* high part is right in little-endian  */
> +# endif
>  #endif
>
> -ENTRY (memset)
> -       .set    noreorder
> +/* Bookkeeping values for 32 vs. 64 bit mode.  */
> +#ifdef USE_DOUBLE
> +# define NSIZE 8
> +# define NSIZEMASK 0x3f
> +# define NSIZEDMASK 0x7f
> +#else
> +# define NSIZE 4
> +# define NSIZEMASK 0x1f
> +# define NSIZEDMASK 0x3f
> +#endif
> +#define UNIT(unit) ((unit)*NSIZE)
> +#define UNITM1(unit) (((unit)*NSIZE)-1)
>
> -       slti    ta1, a2, 16             # Less than 16?
> -       bne     ta1, zero, L(last16)
> -       move    v0, a0                  # Setup exit value before too late
> -
> -       beq     a1, zero, L(ueven)      # If zero pattern, no need to extend
> -       andi    a1, 0xff                # Avoid problems with bogus arguments
> -       dsll    ta0, a1, 8
> -       or      a1, ta0
> -       dsll    ta0, a1, 16
> -       or      a1, ta0                 # a1 is now pattern in full word
> -       dsll    ta0, a1, 32
> -       or      a1, ta0                 # a1 is now pattern in double word
> -
> -L(ueven):
> -       PTR_SUBU ta0, zero, a0          # Unaligned address?
> -       andi    ta0, 0x7
> -       beq     ta0, zero, L(chkw)
> -       PTR_SUBU a2, ta0
> -       SDHI    a1, 0(a0)               # Yes, handle first unaligned part
> -       PTR_ADDU a0, ta0                # Now both a0 and a2 are updated
> +#ifdef ANDROID_CHANGES
> +LEAF(MEMSET_NAME,0)
> +#else
> +LEAF(MEMSET_NAME)
> +#endif
>
> -L(chkw):
> -       andi    ta0, a2, 0xf            # Enough left for one loop iteration?
> -       beq     ta0, a2, L(chkl)
> -       PTR_SUBU a3, a2, ta0
> -       PTR_ADDU a3, a0                 # a3 is last loop address +1
> -       move    a2, ta0                 # a2 is now # of bytes left after loop
> -L(loopw):
> -       PTR_ADDIU a0, 16                # Handle 2 dwords pr. iteration
> -       sd      a1, -16(a0)
> -       bne     a0, a3, L(loopw)
> -       sd      a1,  -8(a0)
> -
> -L(chkl):
> -       andi    ta0, a2, 0x8            # Check if there is at least a double
> -       beq     ta0, zero, L(last16)    #  word remaining after the loop
> -       PTR_SUBU a2, ta0
> -       sd      a1, 0(a0)               # Yes...
> -       PTR_ADDIU a0, 8
> -
> -L(last16):
> -       blez    a2, L(exit)             # Handle last 16 bytes (if cnt>0)
> -       PTR_ADDU a3, a2, a0             # a3 is last address +1
> -L(lst16l):
> -       PTR_ADDIU a0, 1
> -       bne     a0, a3, L(lst16l)
> -       sb      a1, -1(a0)
> -L(exit):
> -       j       ra                      # Bye, bye
> +       .set    nomips16
> +       .set    noreorder
> +/* If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
> +   size, copy dst pointer to v0 for the return value.  */
> +       slti    t2,a2,(2 * NSIZE)
> +       bne     t2,zero,L(lastb)
> +       move    v0,a0
> +
> +/* If memset value is not zero, we copy it to all the bytes in a 32 or 64
> +   bit word.  */
> +       beq     a1,zero,L(set0)         /* If memset value is zero no smear  */
> +       PTR_SUBU a3,zero,a0
>         nop
>
> -       .set    reorder
> -END (memset)
> +       /* smear byte into 32 or 64 bit word */
> +#if ((__mips == 64) || (__mips == 32)) && (__mips_isa_rev >= 2)
> +# ifdef USE_DOUBLE
> +       dins    a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
> +       dins    a1, a1, 16, 16      /* Replicate fill byte into word.       */
> +       dins    a1, a1, 32, 32      /* Replicate fill byte into dbl word.   */
> +# else
> +       ins     a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
> +       ins     a1, a1, 16, 16      /* Replicate fill byte into word.       */
> +# endif
> +#else
> +# ifdef USE_DOUBLE
> +        and     a1,0xff
> +       dsll    t2,a1,8
> +       or      a1,t2
> +       dsll    t2,a1,16
> +       or      a1,t2
> +       dsll    t2,a1,32
> +       or      a1,t2
> +# else
> +        and     a1,0xff
> +       sll     t2,a1,8
> +       or      a1,t2
> +       sll     t2,a1,16
> +       or      a1,t2
> +# endif
> +#endif
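
Both branches above produce the same result: the fill byte replicated across
the whole store unit.  The plain C equivalent, for reference (helper names are
mine):

    #include <stdint.h>

    /* Replicate the low byte of c across a 32- or 64-bit store unit,
       matching the ins/dins (or sll/or) sequences above.  */
    static uint32_t smear32(uint32_t c)
    {
        c &= 0xff;
        c |= c << 8;          /* byte      -> half word */
        c |= c << 16;         /* half word -> word      */
        return c;
    }

    static uint64_t smear64(uint64_t c)
    {
        c = smear32((uint32_t)c);
        c |= c << 32;         /* word -> double word    */
        return c;
    }
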
> +
> +/* If the destination address is not aligned do a partial store to get it
> +   aligned.  If it is already aligned just jump to L(aligned).  */
> +L(set0):
> +#ifndef R6_CODE
> +       andi    t2,a3,(NSIZE-1)         /* word-unaligned address?          */
> +       beq     t2,zero,L(aligned)      /* t2 is the unalignment count      */
> +       PTR_SUBU a2,a2,t2
> +       C_STHI  a1,0(a0)
> +       PTR_ADDU a0,a0,t2
> +#else /* R6_CODE */
> +       andi    t2,a0,(NSIZE-1)
> +       lapc    t9,L(atable)
> +       PTR_LSA t9,t2,t9,2
> +       jrc     t9
> +L(atable):
> +       bc      L(aligned)
> +# ifdef USE_DOUBLE
> +       bc      L(lb7)
> +       bc      L(lb6)
> +       bc      L(lb5)
> +       bc      L(lb4)
> +# endif
> +       bc      L(lb3)
> +       bc      L(lb2)
> +       bc      L(lb1)
> +L(lb7):
> +       sb      a1,6(a0)
> +L(lb6):
> +       sb      a1,5(a0)
> +L(lb5):
> +       sb      a1,4(a0)
> +L(lb4):
> +       sb      a1,3(a0)
> +L(lb3):
> +       sb      a1,2(a0)
> +L(lb2):
> +       sb      a1,1(a0)
> +L(lb1):
> +       sb      a1,0(a0)
> +
> +       li      t9,NSIZE
> +       subu    t2,t9,t2
> +       PTR_SUBU a2,a2,t2
> +       PTR_ADDU a0,a0,t2
> +#endif /* R6_CODE */
> +
> +L(aligned):
> +/* If USE_DOUBLE is not set we may still want to align the data on a 16
> +   byte boundary instead of an 8 byte boundary to maximize the opportunity
> +   of proAptiv chips to do memory bonding (combining two sequential 4
> +   byte stores into one 8 byte store).  We know there are at least 4 bytes
> +   left to store or we would have jumped to L(lastb) earlier in the code.  */
> +#ifdef DOUBLE_ALIGN
> +       andi    t2,a3,4
> +       beq     t2,zero,L(double_aligned)
> +       PTR_SUBU a2,a2,t2
> +       sw      a1,0(a0)
> +       PTR_ADDU a0,a0,t2
> +L(double_aligned):
> +#endif
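
Restating the DOUBLE_ALIGN step in C: on the 32-bit path one extra word store
is issued when the destination sits at 4 mod 8, so the following pairs of sw
instructions land on 8-byte boundaries and can be bonded.  A minimal sketch
(names are mine; it assumes dst is already 4-byte aligned and at least 4 bytes
remain, as the code above guarantees):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the DOUBLE_ALIGN word store before the main loop.  */
    static void double_align(unsigned char **pdst, size_t *pn, uint32_t fill)
    {
        if ((uintptr_t)*pdst & 4) {              /* 4 mod 8: one word first   */
            *(uint32_t *)(void *)*pdst = fill;   /* same value a1 holds       */
            *pdst += 4;
            *pn   -= 4;
        }
    }
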
>
> -#else /* !__mips64 */
> +/* Now the destination is aligned to a (word or double word) boundary.
> +   Set a2 to count how many bytes we have to copy after all the 64/128 byte
> +   chunks are copied and a3 to the dest pointer after all the 64/128 byte
> +   chunks have been copied.  We will loop, incrementing a0 until it equals
> +   a3.  */
> +       andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
> +       beq     a2,t8,L(chkw)    /* if a2==t8, no 64-byte/128-byte chunks */
> +       PTR_SUBU a3,a2,t8        /* subtract the remainder from a2 */
> +       PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */
>
> -#if __BYTE_ORDER == __BIG_ENDIAN
> -# define SWHI  swl             /* high part is left in big-endian      */
> +/* When in the loop we may prefetch with the 'prepare to store' hint;
> +   in this case a0+x should not be past the "t0-32" address.  This
> +   means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
> +   for x=64 the last "safe" a0 address is "t0-96".  In the current version we
> +   will use "prefetch hint,128(a0)", so "t0-160" is the limit.  */
> +#if defined(USE_PREFETCH) \
> +    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +       PTR_ADDU t0,a0,a2               /* t0 is the "past the end" address */
> +       PTR_SUBU t9,t0,PREFETCH_LIMIT   /* t9 is the "last safe pref" address */
> +#endif
> +#if defined(USE_PREFETCH) \
> +    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
> +       PREFETCH_FOR_STORE (1, a0)
> +       PREFETCH_FOR_STORE (2, a0)
> +       PREFETCH_FOR_STORE (3, a0)
> +#endif
> +
> +L(loop16w):
> +#if defined(USE_PREFETCH) \
> +    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> +       sltu    v1,t9,a0                /* If a0 > t9 don't use next prefetch */
> +       bgtz    v1,L(skip_pref)
> +       nop
> +#endif
> +#ifdef R6_CODE
> +       PREFETCH_FOR_STORE (2, a0)
>  #else
> -# define SWHI  swr             /* high part is right in little-endian  */
> +       PREFETCH_FOR_STORE (4, a0)
> +       PREFETCH_FOR_STORE (5, a0)
>  #endif
> +L(skip_pref):
> +       C_ST    a1,UNIT(0)(a0)
> +       C_ST    a1,UNIT(1)(a0)
> +       C_ST    a1,UNIT(2)(a0)
> +       C_ST    a1,UNIT(3)(a0)
> +       C_ST    a1,UNIT(4)(a0)
> +       C_ST    a1,UNIT(5)(a0)
> +       C_ST    a1,UNIT(6)(a0)
> +       C_ST    a1,UNIT(7)(a0)
> +       C_ST    a1,UNIT(8)(a0)
> +       C_ST    a1,UNIT(9)(a0)
> +       C_ST    a1,UNIT(10)(a0)
> +       C_ST    a1,UNIT(11)(a0)
> +       C_ST    a1,UNIT(12)(a0)
> +       C_ST    a1,UNIT(13)(a0)
> +       C_ST    a1,UNIT(14)(a0)
> +       C_ST    a1,UNIT(15)(a0)
> +       PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
> +       bne     a0,a3,L(loop16w)
> +       nop
> +       move    a2,t8
>
> -ENTRY (memset)
> -       .set    noreorder
> +/* Here we have dest word-aligned but less than 64-bytes or 128 bytes to go.
> +   Check for a 32 (64) byte chunk and copy it if there is one.  Otherwise
> +   jump down to L(chk1w) to handle the tail end of the copy.  */
> +L(chkw):
> +       andi    t8,a2,NSIZEMASK /* is there a 32-byte/64-byte chunk.  */
> +                               /* t8 is the remainder count past 32 bytes */
> +       beq     a2,t8,L(chk1w)/* when a2==t8, no 32-byte chunk */
> +       nop
> +       C_ST    a1,UNIT(0)(a0)
> +       C_ST    a1,UNIT(1)(a0)
> +       C_ST    a1,UNIT(2)(a0)
> +       C_ST    a1,UNIT(3)(a0)
> +       C_ST    a1,UNIT(4)(a0)
> +       C_ST    a1,UNIT(5)(a0)
> +       C_ST    a1,UNIT(6)(a0)
> +       C_ST    a1,UNIT(7)(a0)
> +       PTR_ADDIU a0,a0,UNIT(8)
> +
> +/* Here we have less than 32(64) bytes to set.  Set up for a loop to
> +   copy one word (or double word) at a time.  Set a2 to count how many
> +   bytes we have to copy after all the word (or double word) chunks are
> +   copied and a3 to the dest pointer after all the (d)word chunks have
> +   been copied.  We will loop, incrementing a0 until a0 equals a3.  */
> +L(chk1w):
> +       andi    a2,t8,(NSIZE-1) /* a2 is the remainder past the (d)word chunks */
> +       beq     a2,t8,L(lastb)
> +       PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
> +       PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */
>
> -       slti    t1, a2, 8               # Less than 8?
> -       bne     t1, zero, L(last8)
> -       move    v0, a0                  # Setup exit value before too late
> -
> -       beq     a1, zero, L(ueven)      # If zero pattern, no need to extend
> -       andi    a1, 0xff                # Avoid problems with bogus arguments
> -       sll     t0, a1, 8
> -       or      a1, t0
> -       sll     t0, a1, 16
> -       or      a1, t0                  # a1 is now pattern in full word
> -
> -L(ueven):
> -       subu    t0, zero, a0            # Unaligned address?
> -       andi    t0, 0x3
> -       beq     t0, zero, L(chkw)
> -       subu    a2, t0
> -       SWHI    a1, 0(a0)               # Yes, handle first unaligned part
> -       addu    a0, t0                  # Now both a0 and a2 are updated
> -
> -L(chkw):
> -       andi    t0, a2, 0x7             # Enough left for one loop iteration?
> -       beq     t0, a2, L(chkl)
> -       subu    a3, a2, t0
> -       addu    a3, a0                  # a3 is last loop address +1
> -       move    a2, t0                  # a2 is now # of bytes left after loop
> -L(loopw):
> -       addiu   a0, 8                   # Handle 2 words pr. iteration
> -       sw      a1, -8(a0)
> -       bne     a0, a3, L(loopw)
> -       sw      a1, -4(a0)
> -
> -L(chkl):
> -       andi    t0, a2, 0x4             # Check if there is at least a full
> -       beq     t0, zero, L(last8)      #  word remaining after the loop
> -       subu    a2, t0
> -       sw      a1, 0(a0)               # Yes...
> -       addiu   a0, 4
> -
> -L(last8):
> -       blez    a2, L(exit)             # Handle last 8 bytes (if cnt>0)
> -       addu    a3, a2, a0              # a3 is last address +1
> -L(lst8l):
> -       addiu   a0, 1
> -       bne     a0, a3, L(lst8l)
> -       sb      a1, -1(a0)
> -L(exit):
> -       j       ra                      # Bye, bye
> +/* copying in words (4-byte or 8 byte chunks) */
> +L(wordCopy_loop):
> +       PTR_ADDIU a0,a0,UNIT(1)
> +       bne     a0,a3,L(wordCopy_loop)
> +       C_ST    a1,UNIT(-1)(a0)
> +
> +/* Copy the last 8 (or 16) bytes */
> +L(lastb):
> +       blez    a2,L(leave)
> +       PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
> +L(lastbloop):
> +       PTR_ADDIU a0,a0,1
> +       bne     a0,a3,L(lastbloop)
> +       sb      a1,-1(a0)
> +L(leave):
> +       j       ra
>         nop
>
> +       .set    at
>         .set    reorder
> -END (memset)
> -
> -#endif /* !__mips64 */
> +END(MEMSET_NAME)
> +#ifndef ANDROID_CHANGES
> +# ifdef _LIBC
> +#  ifdef __UCLIBC__
> +libc_hidden_def(MEMSET_NAME)
> +#  else
> +libc_hidden_builtin_def (MEMSET_NAME)
> +#  endif
> +# endif
> +#endif
>
> -libc_hidden_def(memset)
> --
> 1.7.9.5
>
> _______________________________________________
> uClibc mailing list
> uClibc at uclibc.org
> http://lists.busybox.net/mailman/listinfo/uclibc

