svn commit: trunk/uClibc/libc/string/x86_64

vda at uclibc.org vda at uclibc.org
Tue Apr 15 08:27:24 UTC 2008


Author: vda
Date: 2008-04-15 01:27:24 -0700 (Tue, 15 Apr 2008)
New Revision: 21738

Log:
amd64 string ops: use alignment more carefully, and comment it.
By capping the maximum padding so it is never bigger than the next three insns,
we avoid having ridiculously big NOPs like this one:

53:66 66 66 66 2e 0f 1f nopw   %cs:0x0(%rax,%rax,1)
5a:84 00 00 00 00 00 

which was bigger than the next three insns combined!

Size changes:

   text    data     bss     dec     hex filename
    102       0       0     102      66 x86_64/memcpy.o
    102       0       0     102      66 x86_64.old/memcpy.o

     90       0       0      90      5a x86_64/mempcpy.o
    102       0       0     102      66 x86_64.old/mempcpy.o

    210       0       0     210      d2 x86_64/memset.o
    242       0       0     242      f2 x86_64.old/memset.o

    213       0       0     213      d5 x86_64/stpcpy.o
    220       0       0     220      dc x86_64.old/stpcpy.o

    428       0       0     428     1ac x86_64/strcat.o
    444       0       0     444     1bc x86_64.old/strcat.o

    417       0       0     417     1a1 x86_64/strchr.o
    418       0       0     418     1a2 x86_64.old/strchr.o

     33       0       0      33      21 x86_64/strcmp.o
     33       0       0      33      21 x86_64.old/strcmp.o

    213       0       0     213      d5 x86_64/strcpy.o
    220       0       0     220      dc x86_64.old/strcpy.o

    135       0       0     135      87 x86_64/strcspn.o
    151       0       0     151      97 x86_64.old/strcspn.o

    225       0       0     225      e1 x86_64/strlen.o
    233       0       0     233      e9 x86_64.old/strlen.o

    140       0       0     140      8c x86_64/strpbrk.o
    156       0       0     156      9c x86_64.old/strpbrk.o

    135       0       0     135      87 x86_64/strspn.o
    151       0       0     151      97 x86_64.old/strspn.o

Also, a few files got their .text alignment relaxed from 16 to 8 bytes,
which reduces padding at link time.



Modified:
   trunk/uClibc/libc/string/x86_64/memcpy.S
   trunk/uClibc/libc/string/x86_64/memset.S
   trunk/uClibc/libc/string/x86_64/strcat.S
   trunk/uClibc/libc/string/x86_64/strchr.S
   trunk/uClibc/libc/string/x86_64/strcpy.S
   trunk/uClibc/libc/string/x86_64/strcspn.S
   trunk/uClibc/libc/string/x86_64/strlen.S
   trunk/uClibc/libc/string/x86_64/strspn.S


Changeset:
Modified: trunk/uClibc/libc/string/x86_64/memcpy.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/memcpy.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/memcpy.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -59,9 +59,9 @@
 	subq	$32, %rcx
 	js	2f
 
-	.p2align 4
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
 3:
-
 	/* Now correct the loop counter.  Please note that in the following
 	   code the flags are not changed anymore.  */
 	subq	$32, %rcx

Modified: trunk/uClibc/libc/string/x86_64/memset.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/memset.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/memset.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -55,8 +55,10 @@
 	test	$0x7,%edi	/* Check for alignment.  */
 	jz	2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte.  */
 	mov	%sil,(%rcx)
 	dec	%rdx
 	inc	%rcx
@@ -70,8 +72,10 @@
 	cmp	LARGE, %rdx
 	jae	11f
 
-	.p2align 4
-3:	/* Fill 64 bytes.  */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes.  */
 	mov	%r8,(%rcx)
 	mov	%r8,0x8(%rcx)
 	mov	%r8,0x10(%rcx)
@@ -114,9 +118,11 @@
 #endif
 	retq
 
-	.p2align 4
-11:	/* Fill 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache.  */
+	/* We could use	movntdq %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers.  */
 	movnti	%r8,(%rcx)
 	movnti  %r8,0x8(%rcx)

Modified: trunk/uClibc/libc/string/x86_64/strcat.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strcat.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strcat.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -45,7 +45,9 @@
 
 
 	/* Now the source is aligned.  Scan for NUL byte.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
@@ -103,8 +105,11 @@
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */
@@ -160,7 +165,9 @@
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 22:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -237,7 +244,9 @@
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 23:
 	movb	%al, (%rdx)	/* 1st byte.  */
 	testb	%al, %al	/* Is it NUL.  */

Modified: trunk/uClibc/libc/string/x86_64/strchr.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strchr.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strchr.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -92,7 +92,8 @@
 	 each of whose bytes is C.  This turns each byte that is C
 	 into a zero.  */
 
-	.p2align 4
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
@@ -230,8 +231,11 @@
 	   reversed.  */
 
 
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
+	/* Align, it's a jump target.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+3:
+	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
 	subq	$8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte C? */
 	jz 6f			/* yes => return pointer */
@@ -281,7 +285,7 @@
 	incq %rax
 
 6:
-	nop
+	/* nop - huh?? */
 	retq
 END (BP_SYM (strchr))
 

Modified: trunk/uClibc/libc/string/x86_64/strcpy.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strcpy.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strcpy.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -53,7 +53,9 @@
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 1:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -130,7 +132,9 @@
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 3:
 	/* Note that stpcpy needs to return with the value of the NUL
 	   byte.  */

Modified: trunk/uClibc/libc/string/x86_64/strcspn.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strcspn.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strcspn.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -55,7 +55,9 @@
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+
 L(2):	movb (%rax), %cl	/* get byte from skipset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
@@ -88,7 +90,13 @@
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */

Modified: trunk/uClibc/libc/string/x86_64/strlen.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strlen.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strlen.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -40,8 +40,11 @@
 
 1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
 
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
+	/* Align loop.  */
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
+4:
+	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
@@ -98,8 +101,11 @@
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */

Modified: trunk/uClibc/libc/string/x86_64/strspn.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strspn.S	2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strspn.S	2008-04-15 08:27:24 UTC (rev 21738)
@@ -50,8 +50,10 @@
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+L(2):
+	movb (%rax), %cl	/* get byte from stopset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
@@ -83,8 +85,14 @@
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+L(3):
+	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */




More information about the uClibc-cvs mailing list