patch-1.3.7 linux/arch/alpha/lib/memcpy.c

diff -u --recursive --new-file v1.3.6/linux/arch/alpha/lib/memcpy.c linux/arch/alpha/lib/memcpy.c
@@ -5,70 +5,111 @@
  */
 
 /*
- * This is reasonably optimized for the quad-word-aligned case, which
- * happens with page/buffer copies. It's horribly bad for the unaligned
- * case: it could be made much better, but that would require lots of
- * assembly (unaligned 8-byte load + shift + aligned 4-byte store, for
- * example).
+ * This is a reasonably optimized memcpy() routine.
  */
 
-#include <linux/types.h>
+/*
+ * Note that the C code is written to be optimized into good assembly. However,
+ * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
+ * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
+ * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
+ */
 
-static inline void __memcpy_b(unsigned long d, unsigned long s, long n)
-{
-	while (--n >= 0)
-		*(char *) (d++) = *(char *) (s++);
-}
+#include <linux/types.h>
 
-static inline void __memcpy_q(unsigned long d, unsigned long s, long n)
-{
-	/* this first part could be done in one go with ldq_u*2/mask/stq_u */
-	while (d & 7) {
-		if (--n < 0)
-			return;
-		*(char *) d = *(char *) s;
-		d++;
-		s++;
+/*
+ * This should be done in one go with ldq_u*2/mask/stq_u. Do it
+ * with a macro so that we can fix it up later..
+ */
+#define ALIGN_DEST_TO8(d,s,n) \
+	while (d & 7) { \
+		if (n <= 0) return; \
+		n--; \
+		*(char *) d = *(char *) s; \
+		d++; s++; \
 	}
-	while ((n -= 8) >= 0) {
-		*(unsigned long *) d = *(unsigned long *) s;
-		d += 8;
-		s += 8;
+
+/*
+ * This should similarly be done with ldq_u*2/mask/stq. The destination
+ * is aligned, but we don't fill in a full quad-word.
+ */
+#define DO_REST(d,s,n) \
+	while (n > 0) { \
+		n--; \
+		*(char *) d = *(char *) s; \
+		d++; s++; \
 	}
-	/* as could this.. */
-	__memcpy_b(d,s,n+8);
-}	
 
-static inline void __memcpy_l(unsigned long d, unsigned long s, long n)
+/*
+ * This should be done with ldq/mask/stq. The source and destination are
+ * aligned, but we don't fill in a full quad-word.
+ */
+#define DO_REST_ALIGNED(d,s,n) DO_REST(d,s,n)
+
+/*
+ * This does unaligned memory copies. We want to avoid storing to
+ * an unaligned address, as that would do a read-modify-write cycle.
+ * We also want to avoid reading each unaligned source word twice.
+ *
+ * Note the ordering to try to avoid load (and address generation) latencies.
+ */
+static inline void __memcpy_unaligned(unsigned long d, unsigned long s, long n)
 {
-	while (d & 3) {
-		if (--n < 0)
-			return;
-		*(char *) d = *(char *) s;
-		d++;
-		s++;
+	ALIGN_DEST_TO8(d,s,n);
+	n -= 8;			/* to avoid compare against 8 in the loop */
+	if (n >= 0) {
+		unsigned long low_word, high_word;
+		__asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
+		do {
+			unsigned long tmp;
+			__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
+			n -= 8;
+			__asm__("extql %1,%2,%0"
+				:"=r" (low_word)
+				:"r" (low_word), "r" (s));
+			__asm__("extqh %1,%2,%0"
+				:"=r" (tmp)
+				:"r" (high_word), "r" (s));
+			s += 8;
+			*(unsigned long *) d = low_word | tmp;
+			d += 8;
+			low_word = high_word;
+		} while (n >= 0);
 	}
-	while ((n -= 4) >= 0) {
-		*(unsigned int *) d = *(unsigned int *) s;
-		d += 4;
-		s += 4;
+	n += 8;
+	DO_REST(d,s,n);
+}
+
+/*
+ * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
+ * for the load-store. I don't know why, but it would seem that using a floating
+ * point register for the move slows things down (very small difference,
+ * though).
+ *
+ * Note the ordering to try to avoid load (and address generation) latencies.
+ */
+static inline void __memcpy_aligned(unsigned long d, unsigned long s, long n)
+{
+	ALIGN_DEST_TO8(d,s,n);
+	n -= 8;
+	while (n >= 0) {
+		unsigned long tmp;
+		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
+		n -= 8;
+		s += 8;
+		*(unsigned long *) d = tmp;
+		d += 8;
 	}
-	__memcpy_b(d,s,n+4);
-}	
+	n += 8;
+	DO_REST_ALIGNED(d,s,n);
+}
 
 void * __memcpy(void * dest, const void *src, size_t n)
 {
-	unsigned long differ;
-	differ = ((unsigned long) dest ^ (unsigned long) src) & 7;
-
-	if (!differ) {
-		__memcpy_q((unsigned long) dest, (unsigned long) src, n);
-		return dest;
-	}
-	if (differ == 4) {
-		__memcpy_l((unsigned long) dest, (unsigned long) src, n);
+	if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
+		__memcpy_aligned((unsigned long) dest, (unsigned long) src, n);
 		return dest;
 	}
-	__memcpy_b((unsigned long) dest, (unsigned long) src, n);
+	__memcpy_unaligned((unsigned long) dest, (unsigned long) src, n);
 	return dest;
 }
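
For readers who don't have the Alpha byte-manipulation instructions in their head, here is a rough, portable C model of what the ldq_u/extql/extqh sequence in __memcpy_unaligned above accomplishes: two aligned quad-word loads are shifted by the source's byte offset and OR'd together, reconstructing one unaligned quad-word so that the destination can be written with a single aligned store. The helper name and the plain-shift arithmetic are illustrative only and are not part of the patch; note also that in the patch this path only runs with a non-zero source offset, since the destination has already been aligned by ALIGN_DEST_TO8.

#include <stdint.h>

/*
 * Illustrative model only (little-endian, as on the Alpha): read the
 * unaligned quad-word at s using two aligned loads, the way the
 * ldq_u + extql/extqh sequence does in __memcpy_unaligned above.
 */
static uint64_t read_unaligned_quad(const unsigned char *s)
{
	uintptr_t addr   = (uintptr_t) s;
	unsigned  offset = addr & 7;	/* byte offset within the aligned quad-word */
	const uint64_t *p = (const uint64_t *) (addr - offset);
	uint64_t low = p[0];		/* plays the role of "low_word" */

	if (!offset)			/* aligned source: one load is enough */
		return low;
	/* extql keeps the high bytes of "low", extqh supplies the low bytes of p[1] */
	return (low >> (8 * offset)) | (p[1] << (64 - 8 * offset));
}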

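Because the choice between __memcpy_aligned and __memcpy_unaligned depends only on the low three bits of the two pointers, a routine like this is conveniently sanity-checked by sweeping every source/destination offset combination. The user-space harness below is a hypothetical sketch of such a test, not part of this patch; it assumes __memcpy is linked in and simply compares its output against the libc memcpy() for each offset pair and a handful of lengths.

#include <stdio.h>
#include <string.h>

void *__memcpy(void *dest, const void *src, size_t n);	/* routine under test */

int main(void)
{
	static unsigned char src[256], dst[256], ref[256];
	static const int lens[] = { 0, 1, 7, 8, 9, 31, 64, 100 };
	int so, dof, li, i;

	for (i = 0; i < (int) sizeof(src); i++)
		src[i] = (unsigned char) i;

	for (so = 0; so < 8; so++)		/* source offset mod 8 */
		for (dof = 0; dof < 8; dof++)	/* destination offset mod 8 */
			for (li = 0; li < (int) (sizeof(lens) / sizeof(lens[0])); li++) {
				int len = lens[li];

				memset(dst, 0xaa, sizeof(dst));
				memset(ref, 0xaa, sizeof(ref));
				__memcpy(dst + dof, src + so, len);
				memcpy(ref + dof, src + so, len);
				/* compare whole buffers to also catch stray writes */
				if (memcmp(dst, ref, sizeof(dst))) {
					printf("mismatch: src+%d dst+%d len=%d\n", so, dof, len);
					return 1;
				}
			}
	printf("all offset/length combinations match\n");
	return 0;
}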