patch-2.3.43 linux/arch/ia64/lib/memset.S

diff -u --recursive --new-file v2.3.42/linux/arch/ia64/lib/memset.S linux/arch/ia64/lib/memset.S
@@ -0,0 +1,111 @@
+/*
+ *
+ * Optimized version of the standard memset() function
+ *
+ * Return: none
+ *
+ *
+ * Inputs:
+ *	in0:	address of buffer
+ *	in1:	byte value to use for storing
+ *	in2:	length of the buffer
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ */
+
+
+// arguments
+//
+#define buf		r32
+#define val		r33
+#define len		r34
+
+//
+// local registers
+//
+#define saved_pfs	r14
+#define cnt		r18
+#define buf2		r19
+#define saved_lc	r20
+#define saved_pr	r21
+#define tmp		r22
+
+	.text
+	.psr	abi64
+	.psr	lsb
+
+	.align	16
+	.global	memset
+	.proc	memset
+
+memset:
+	alloc	saved_pfs=ar.pfs,3,0,0,0	// cnt is sink here
+	cmp.eq p8,p0=r0,len	// check for zero length
+	mov saved_lc=ar.lc	// preserve ar.lc (slow)
+	;;
+	adds tmp=-1,len		// br.cloop is repeat/until (runs ar.lc+1 times)
+	tbit.nz p6,p0=buf,0	// odd alignment
+(p8)	br.ret.spnt.few rp
+
+	cmp.lt p7,p0=16,len	// if len > 16 then long memset
+	mux1 val=val,@brcst	// broadcast low byte of val to all 8 bytes
+(p7)	br.cond.dptk.few long_memset
+	;;
+	mov ar.lc=tmp		// initialize lc for small count
+	;;			// avoid RAW and WAW on ar.lc
+1:				// worst case 15 cycles, avg 8 cycles
+	st1 [buf]=val,1
+	br.cloop.dptk.few 1b
+	;;				// avoid RAW on ar.lc
+	mov ar.lc=saved_lc
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.few rp	// end of short memset
+
+	// at this point we know we have more than 16 bytes to set
+	// so we focus on alignment
+long_memset:
+(p6)	st1 [buf]=val,1		// 1-byte aligned
+(p6)	adds len=-1,len;;	// sync because buf is modified
+	tbit.nz p6,p0=buf,1
+	;;
+(p6)	st2 [buf]=val,2		// 2-byte aligned
+(p6)	adds len=-2,len;;
+	tbit.nz p6,p0=buf,2
+	;;
+(p6)	st4 [buf]=val,4		// 4-byte aligned
+(p6)	adds len=-4,len;;
+	tbit.nz p6,p0=buf,3
+	;;
+(p6)	st8 [buf]=val,8		// 8-byte aligned
+(p6)	adds len=-8,len;;
+	shr.u cnt=len,4		// number of 128-bit (2 x 64-bit) words
+	;;
+	cmp.eq p6,p0=r0,cnt
+	adds tmp=-1,cnt
+(p6)	br.cond.dpnt.few .dotail // we have less than 16 bytes left
+	;;
+	adds buf2=8,buf		// setup second base pointer
+	mov ar.lc=tmp
+	;;
+2:				// 16 bytes/iteration
+	st8 [buf]=val,16
+	st8 [buf2]=val,16
+	br.cloop.dptk.few 2b
+	;;
+.dotail:			// tail correction based on len only
+	tbit.nz p6,p0=len,3
+	;;
+(p6)	st8 [buf]=val,8		// at least 8 bytes
+	tbit.nz p6,p0=len,2
+	;;
+(p6)	st4 [buf]=val,4		// at least 4 bytes
+	tbit.nz p6,p0=len,1
+	;;
+(p6)	st2 [buf]=val,2		// at least 2 bytes
+	tbit.nz p6,p0=len,0
+	mov ar.lc=saved_lc
+	;;
+(p6)	st1 [buf]=val		// only 1 byte left
+	br.ret.dptk.few rp
+	.endp
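
For readers who do not speak IA-64 assembly: the mux1 val=val,@brcst
instruction replicates the least-significant byte of val into all eight
byte lanes of the register, so each later st8 stores eight copies of the
fill byte at once. A minimal C sketch of the same trick, using the
classic multiply-by-0x0101...01 idiom (the function name is illustrative,
not from the patch):

#include <stdint.h>

/* Replicate the low byte of c across a 64-bit word -- the C
 * equivalent of mux1 val=val,@brcst in the assembly above. */
static uint64_t broadcast_byte(int c)
{
	return (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
}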

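The routine's control flow also maps onto straightforward C. The sketch
below mirrors the same strategy -- a plain byte loop for short counts, an
alignment ramp that peels 1/2/4/8 bytes until the pointer is 16-byte
aligned, a bulk loop writing 16 bytes per iteration through two base
pointers, and a tail driven by the low four bits of the remaining length.
It is an illustration under kernel-style assumptions (aligned stores
through pointer casts), not the patch's implementation, and it returns
void to match the "Return: none" contract in the header comment:

#include <stddef.h>
#include <stdint.h>

void memset_sketch(void *s, int c, size_t n)
{
	unsigned char *p = s;
	uint64_t v = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

	if (n <= 16) {				/* short memset: byte loop */
		while (n--)
			*p++ = (unsigned char)c;
		return;
	}

	/* Alignment ramp: clear address bits 0..3 one store at a time,
	 * mirroring the tbit.nz/st1/st2/st4/st8 ladder above. */
	if ((uintptr_t)p & 1) { *p = (unsigned char)c; p += 1; n -= 1; }
	if ((uintptr_t)p & 2) { *(uint16_t *)p = (uint16_t)v; p += 2; n -= 2; }
	if ((uintptr_t)p & 4) { *(uint32_t *)p = (uint32_t)v; p += 4; n -= 4; }
	if ((uintptr_t)p & 8) { *(uint64_t *)p = v; p += 8; n -= 8; }

	/* Bulk loop: 16 bytes per iteration via two 64-bit stores,
	 * like the buf/buf2 pointer pair in the assembly. */
	for (size_t i = n >> 4; i != 0; i--) {
		*(uint64_t *)p = v;
		*(uint64_t *)(p + 8) = v;
		p += 16;
	}

	/* Tail: the low four bits of n say exactly which stores remain. */
	if (n & 8) { *(uint64_t *)p = v; p += 8; }
	if (n & 4) { *(uint32_t *)p = (uint32_t)v; p += 4; }
	if (n & 2) { *(uint16_t *)p = (uint16_t)v; p += 2; }
	if (n & 1) { *p = (unsigned char)c; }
}

The two-pointer unrolling is the notable design choice: with independent
st8 stores to buf and buf2 issuing together, the counted loop (br.cloop
driven by ar.lc) retires 16 bytes per iteration without branch
misprediction.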