* [PATCH 1/5] string: Make tests bidirectional test-memcpy.c
@ 2021-08-24  8:27 Noah Goldstein via Libc-alpha
  2021-08-24  8:27 ` [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
                   ` (5 more replies)
  0 siblings, 6 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24  8:27 UTC (permalink / raw)
  To: libc-alpha

This commit updates the memcpy tests to cover both dst > src and
dst < src. This is because the implementations contain logic that
depends on that condition.
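
As a rough illustration (not the harness code itself), the idea is to
run each copy case twice with the roles of the two buffers swapped so
both orderings of dst and src get exercised.  A minimal C sketch, with
the helper name and the verification step purely hypothetical:

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Illustrative only: run one copy case with dst above src, then with
   dst below src, re-filling the source each pass.  The real tests go
   through FOR_EACH_IMPL/do_one_test instead of plain memcpy.  */
static void
check_both_directions (char *lo_buf, char *hi_buf, size_t len)
{
  char *src = lo_buf, *dst = hi_buf;
  for (int pass = 0; pass < 2; ++pass)
    {
      for (size_t i = 0; i < len; ++i)
        src[i] = (char) (1 + 23 * i);
      memcpy (dst, src, len);
      assert (memcmp (dst, src, len) == 0);
      /* Swap roles so the second pass takes the other ordering.  */
      char *tmp = src;
      src = dst;
      dst = tmp;
    }
}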
---
 string/test-memcpy.c  | 125 +++++++++++++++++++++++++++++++++---------
 string/test-memmove.c |  73 +++++++++++++++++++++++-
 2 files changed, 170 insertions(+), 28 deletions(-)

diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index c9dfc88fed..705d79ba13 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -79,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
 static void
 do_test (size_t align1, size_t align2, size_t len)
 {
-  size_t i, j;
+  size_t i, j, repeats;
   char *s1, *s2;
 
   align1 &= 4095;
@@ -92,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
 
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      for (i = 0, j = 1; i < len; i++, j += 23)
+        s1[i] = j;
 
-  for (i = 0, j = 1; i < len; i++, j += 23)
-    s1[i] = j;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (impl, s2, s1, len);
+    }
 }
 
 static void
@@ -213,56 +215,88 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (size_t size)
+do_test1 (size_t align1, size_t align2, size_t size)
 {
   void *large_buf;
-  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANON, -1, 0);
+  size_t mmap_size, region_size;
+
+  align1 &= (page_size - 1);
+  if (align1 == 0)
+    align1 = page_size;
+
+  align2 &= (page_size - 1);
+  if (align2 == 0)
+    align2 = page_size;
+
+  region_size = (size + page_size - 1) & (~(page_size - 1));
+
+  mmap_size = region_size * 2 + 3 * page_size;
+  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
     {
-      puts ("Failed to allocat large_buf, skipping do_test1");
+      puts ("Failed to allocate large_buf, skipping do_test1");
       return;
     }
-
-  if (mprotect (large_buf + size, page_size, PROT_NONE))
+  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
     error (EXIT_FAILURE, errno, "mprotect failed");
 
-  size_t arrary_size = size / sizeof (uint32_t);
-  uint32_t *dest = large_buf;
-  uint32_t *src = large_buf + size + page_size;
+  size_t array_size = size / sizeof (uint32_t);
+  uint32_t *dest = large_buf + align1;
+  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
   size_t i;
   size_t repeats;
   for(repeats = 0; repeats < 2; repeats++)
     {
-      for (i = 0; i < arrary_size; i++)
+      for (i = 0; i < array_size; i++)
         src[i] = (uint32_t) i;
-
       FOR_EACH_IMPL (impl, 0)
         {
-            printf ("\t\tRunning: %s\n", impl->name);
+            //            printf ("\t\tRunning: %s\n", impl->name);
           memset (dest, -1, size);
           CALL (impl, (char *) dest, (char *) src, size);
-          for (i = 0; i < arrary_size; i++)
+          for (i = 0; i < array_size; i++)
         if (dest[i] != src[i])
           {
             error (0, 0,
                "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
                impl->name, dest, src, i);
             ret = 1;
-            munmap ((void *) large_buf, size * 2 + page_size);
+            munmap ((void *) large_buf, mmap_size);
             return;
           }
         }
-      dest = src;
-      src = large_buf;
+      dest = large_buf + region_size + 2 * page_size + align1;
+      src = large_buf + align2;
+    }
+  munmap ((void *) large_buf, mmap_size);
+}
+
+static void
+do_random_large_tests (void)
+{
+  size_t i, align1, align2, size;
+  for (i = 0; i < 32; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 0x1000000) + 0x200000;
+      do_test1 (align1, align2, size);
+    }
+
+  for (i = 0; i < 128; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 32768) + 4096;
+      do_test1 (align1, align2, size);
     }
-  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j;
 
   test_init ();
 
@@ -299,6 +333,7 @@ test_main (void)
   for (i = 19; i <= 25; ++i)
     {
       do_test (255, 0, 1 << i);
+      do_test (0, 4000, 1 << i);
       do_test (0, 255, i);
       do_test (0, 4000, i);
     }
@@ -307,8 +342,46 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test1 (0x100000);
-  do_test1 (0x2000000);
+  do_test1 (0, 0, 0x100000);
+  do_test1 (0, 0, 0x2000000);
+
+  for (i = 4096; i < 32768; i += 4096)
+    {
+      for (j = 1; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  for (i = 0x300000; i < 0x2000000; i += 0x235689)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  do_random_large_tests ();
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index 670094c9dc..5ba79acf61 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -101,11 +101,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize() - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize() - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -356,6 +356,51 @@ do_test3 (size_t bytes_move, size_t offset)
   munmap ((void *) buf, size);
 }
 
+static void
+do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
+{
+  size_t size, repeats, i;
+  uint8_t *buf, *dst, *src;
+
+  size = bytes_move + MAX(offset1, offset2);
+  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANON, -1, 0);
+
+  if (buf == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+  dst = &buf[offset1];
+  src = &buf[offset2];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < bytes_move; i++)
+              src[i] = (uint8_t) i;
+#ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+#else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+#endif
+          for (i = 0; i < bytes_move; i++)
+            {
+              if (dst[i] != (uint8_t) i)
+                {
+                  error (0, 0,
+                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                         impl->name, dst, buf, i);
+                  ret = 1;
+                  break;
+                }
+            }
+        }
+      dst = &buf[offset2];
+      src = &buf[offset1];
+    }
+  munmap ((void *) buf, size);
+}
+
+
 int
 test_main (void)
 {
@@ -396,13 +441,37 @@ test_main (void)
 
   do_random_tests ();
 
+  do_test2 (0);
   do_test2 (33);
   do_test2 (0x200000);
+  do_test2 (0x200000 - 1);
+  do_test2 (0x200000 + 1);
+  do_test2 (0x1000000 + 1);
   do_test2 (0x4000000 - 1);
   do_test2 (0x4000000);
 
+
   /* Copy 16KB data.  */
   do_test3 (16384, 3);
+  for (i = 4096; i <= 16384; i <<= 1)
+    {
+      do_test4 (i, 0, i);
+      do_test4 (i, 0, i - 1);
+      do_test4 (i, 0, i + 1);      
+      do_test4 (i, 63, i + 63);
+      do_test4 (i, 63, i + 64);
+      do_test4 (i, 63, i);
+
+      do_test4 (i, 0, 1);
+      do_test4 (i, 0, 15);
+      do_test4 (i, 0, 31);
+      do_test4 (i, 0, 63);
+      do_test4 (i, 0, 64);
+      do_test4 (i, 0, 65);
+      do_test4 (i, 0, 127);
+      do_test4 (i, 0, 129);
+    }
+
 
   return ret;
 }
-- 
2.25.1



* [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c
  2021-08-24  8:27 [PATCH 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein via Libc-alpha
@ 2021-08-24  8:27 ` Noah Goldstein via Libc-alpha
  2021-08-24 15:18   ` H.J. Lu via Libc-alpha
  2021-08-24  8:27 ` [PATCH 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein via Libc-alpha
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24  8:27 UTC (permalink / raw)
  To: libc-alpha

This commit adds three new benchmarks using the SPEC2017-derived
distribution. One randomizes whether dst > src and the other two fix
it to 1 or 0 respectively.

It also adds benchmarks for fixed sizes with randomized alignment and
randomized dst > src. This can be useful for testing different
alignment configurations.
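
Roughly, the direction control maps to buffer placement as in the
sketch below.  It is a simplified model of the init_copy helper in
the diff (the real code re-randomizes the -1 case inside the per-copy
loop); the function name here is illustrative:

#include <stdlib.h>
#include <unistd.h>

#define MAX_TEST_SIZE (512 * 1024)

/* Pick which half of the working region dst and src land in so the
   sign of dst - src is controlled: 0 means dst < src, 1 means
   dst > src, -1 means choose randomly.  */
static void
pick_offsets (int dst_gt_src, size_t *dst_off, size_t *src_off)
{
  size_t hi = MAX_TEST_SIZE + getpagesize ();
  if (dst_gt_src == -1)
    dst_gt_src = rand () & 1;
  *dst_off = dst_gt_src ? hi : 0;
  *src_off = dst_gt_src ? 0 : hi;
}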
---
 benchtests/bench-memcpy-random.c | 107 +++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 11 deletions(-)

diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c
index c490b73ed0..28e0acb05f 100644
--- a/benchtests/bench-memcpy-random.c
+++ b/benchtests/bench-memcpy-random.c
@@ -16,7 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define MIN_PAGE_SIZE (512*1024+getpagesize())
+#define MAX_TEST_SIZE (512*1024)
+#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+3*getpagesize())
 #define TEST_MAIN
 #define TEST_NAME "memcpy"
 #include "bench-string.h"
@@ -89,9 +90,12 @@ static align_data_t dst_align_freq[] =
 
 typedef struct
 {
-  uint64_t src : 24;
-  uint64_t dst : 24;
-  uint64_t len : 16;
+/* 26 bits for src and dst so we have extra bit for alternating dst >
+   src without a branch.  */
+  uint64_t src : 26;
+  uint64_t dst : 26;
+  /* For size < 4096 12 bits is enough.  */
+  uint64_t len : 12;
 } copy_t;
 
 static copy_t copy[MAX_COPIES];
@@ -142,34 +146,100 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t max_size)
+do_one_fixed_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
+               copy_t *copy, size_t n, size_t size)
 {
-  int i;
+  timing_t start, stop, cur;
+  size_t iters = INNER_LOOP_ITERS_SMALL;
 
-  memset (buf1, 1, max_size);
+  for (int j = 0; j < n; j++)
+    CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
 
-  /* Create a random set of copies with the given size and alignment
+  TIMING_NOW (start);
+  for (int i = 0; i < iters; ++i)
+    for (int j = 0; j < n; j++)
+      CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+
+static size_t
+init_copy(size_t max_size, int dst_gt_src)
+{
+  size_t i, dst_offset, src_offset;
+  if (dst_gt_src <= 0)
+    {
+      dst_offset = 0;
+      src_offset = MAX_TEST_SIZE + getpagesize();
+    }
+  else
+    {
+      dst_offset = MAX_TEST_SIZE + getpagesize();
+      src_offset = 0;
+    }
+
+    /* Create a random set of copies with the given size and alignment
      distributions.  */
   for (i = 0; i < MAX_COPIES; i++)
     {
+      dst_offset  = dst_gt_src == -1
+                        ? (rand() & 1) ? MAX_TEST_SIZE + getpagesize() : 0
+                        : dst_offset;
       copy[i].dst = (rand () & (max_size - 1));
       copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK];
+      copy[i].dst += dst_offset;
       copy[i].src = (rand () & (max_size - 1));
       copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK];
+      copy[i].src += src_offset;
       copy[i].len = size_arr[rand () & SIZE_MASK];
     }
+  return i;
+}
 
+static void
+do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  memset (buf1, 1, max_size);
+  n = init_copy(max_size, dst_gt_src);
   json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) max_size);
+  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
+  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
+  json_attr_uint (json_ctx, "with-fixed-size", (double) 0);
   json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, i);
+    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n);
 
   json_array_end (json_ctx);
   json_element_object_end (json_ctx);
 }
 
+static void
+do_test_fixed_size (json_ctx_t *json_ctx, size_t size, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  memset (buf1, 1, max_size);
+  n = init_copy(max_size, dst_gt_src);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
+  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
+  json_attr_uint (json_ctx, "with-fixed-size", (double) 1);
+  json_attr_uint (json_ctx, "size", (double) size);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    do_one_fixed_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n, size);
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+
 int
 test_main (void)
 {
@@ -194,7 +264,22 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
   for (int i = 4; i <= 512; i = i * 2)
-    do_test (&json_ctx, i * 1024);
+    {
+      if (i * 1024 > MAX_TEST_SIZE)
+          continue;
+      do_test (&json_ctx, i * 1024, 0);
+      do_test (&json_ctx, i * 1024, 1);
+      do_test (&json_ctx, i * 1024, -1);
+    }
+
+  for (int i = 4; i <= 64; i = i * 2)
+    {
+      if (i * 1024 > MAX_TEST_SIZE)
+          continue;
+      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 0);
+      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 1);
+      do_test_fixed_size (&json_ctx, i * 256, i * 1024, -1);
+    }
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1



* [PATCH 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-08-24  8:27 [PATCH 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein via Libc-alpha
  2021-08-24  8:27 ` [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
@ 2021-08-24  8:27 ` Noah Goldstein via Libc-alpha
  2021-08-24 15:18   ` H.J. Lu via Libc-alpha
  2021-08-24  8:27 ` [PATCH 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein via Libc-alpha
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24  8:27 UTC (permalink / raw)
  To: libc-alpha

This commit adds a new partial overlap benchmark. This is generally
the most interesting performance case for memmove and was missing.
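
For reference, the setup reduces to the sketch below (a simplified
model of the do_test changes in the diff; the enum and function names
are illustrative).  For the partial case the second pointer starts
len / 2 into the same buffer, which is what forces the genuinely
overlapping memmove paths:

#include <stddef.h>
#include <string.h>

enum overlap_kind { NO_OVERLAP_K, PARTIAL_OVERLAP_K, COMPLETE_OVERLAP_K };

/* Illustrative only: set up the two pointers for a given overlap mode
   and copy in both directions, as the benchmark's both_ways loop
   does.  For the partial case buf1 must hold at least
   len + len / 2 bytes.  */
static void
run_overlap_case (char *buf1, char *buf2, size_t len, enum overlap_kind k)
{
  char *s1 = buf1;
  char *s2 = (k == NO_OVERLAP_K) ? buf2 : buf1;
  if (k == PARTIAL_OVERLAP_K)
    s2 += len / 2;
  memmove (s2, s1, len);   /* dst above src  */
  memmove (s1, s2, len);   /* dst below src  */
}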
---
 benchtests/bench-memmove-walk.c | 67 ++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
index b5fdb2a422..18b716f5cb 100644
--- a/benchtests/bench-memmove-walk.c
+++ b/benchtests/bench-memmove-walk.c
@@ -36,6 +36,10 @@
 # define TIMEOUT (20 * 60)
 # include "bench-string.h"
 
+#define NO_OVERLAP 0
+#define PARTIAL_OVERLAP 1
+#define COMPLETE_OVERLAP 2
+
 IMPL (memmove, 1)
 #endif
 
@@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
+do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
 {
-  json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) len);
-  json_array_begin (json_ctx, "timings");
-
-  if (overlap)
-    buf2 = buf1;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
-
-  json_array_end (json_ctx);
-  json_element_object_end (json_ctx);
+  char *s1, *s2, *tmp;    
+  size_t repeats;
+
+  s1 = (char *) (buf1);
+  s2 = (char *) (buf2);
+  if (overlap != NO_OVERLAP)
+    s2 = s1;
+  if (overlap == PARTIAL_OVERLAP)
+    s2 += len / 2;
+
+  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
+    {    
+      json_element_object_begin (json_ctx);
+      json_attr_uint (json_ctx, "length", (double) len);
+      json_attr_string(json_ctx, "overlap",
+                       overlap == NO_OVERLAP        ? "none"
+                       : overlap == PARTIAL_OVERLAP ? "partial"
+                                                    : "complete");
+      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));      
+      json_array_begin (json_ctx, "timings");
+
+
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
+
+      json_array_end (json_ctx);
+      json_element_object_end (json_ctx);
+
+      tmp = s1;
+      s1 = s2;
+      s2 = tmp;
+    }
 }
 
 int
@@ -107,15 +131,22 @@ test_main (void)
   /* Non-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, false);
-      do_test (&json_ctx, i + 1, false);
+      do_test (&json_ctx, i, NO_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
+    }
+
+  /* Partially-overlapping buffers.  */
+  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
+    {
+      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
     }
 
-  /* Overlapping buffers.  */
+  /* Complete-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, true);
-      do_test (&json_ctx, i + 1, true);
+      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
+      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
     }
 
   json_array_end (&json_ctx);
-- 
2.25.1



* [PATCH 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-08-24  8:27 [PATCH 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein via Libc-alpha
  2021-08-24  8:27 ` [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
  2021-08-24  8:27 ` [PATCH 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein via Libc-alpha
@ 2021-08-24  8:27 ` Noah Goldstein via Libc-alpha
  2021-08-24 15:19   ` H.J. Lu via Libc-alpha
  2021-08-24  8:27 ` [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Noah Goldstein via Libc-alpha
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24  8:27 UTC (permalink / raw)
  To: libc-alpha

This commit adds more cases to the common memcpy/memmove
benchmarks. The most significant additions are the half-page
offsets. The current version leaves dst and src nearly page aligned,
which leads to false 4k aliasing on x86_64. This can add noise due to
false dependencies from one run to the next. It also seems like more
of an edge case than the common case, so it shouldn't be the only
thing benchmarked.
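
As a rough model of the 4k aliasing the half-page offsets are meant
to avoid (this approximates the hardware condition and is not code
from the patch):

#include <stdint.h>

/* Two addresses "4k alias" when they share the same offset within a
   4 KiB page, i.e. their low 12 bits match, which is roughly what the
   store-to-load conflict check compares on x86_64.  Near-page-aligned
   dst and src therefore alias systematically; offsetting one side by
   half a page (getpagesize () / 2) breaks that.  */
static int
may_4k_alias (uintptr_t dst, uintptr_t src)
{
  return ((dst ^ src) & 0xfff) == 0;
}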
---
 benchtests/bench-memcpy.c  | 42 ++++++++++++++++++++++++++++++++++----
 benchtests/bench-memmove.c | 21 +++++++++++++++++--
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index d9236a2282..b9e661c997 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -60,11 +60,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   size_t i, j;
   char *s1, *s2;
   size_t repeats;
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -99,7 +99,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
+  size_t half_page = getpagesize () / 2;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -121,8 +121,15 @@ test_main (void)
     {
       do_test (&json_ctx, 0, 0, 1 << i, 1);
       do_test (&json_ctx, i, 0, 1 << i, 1);
+      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
       do_test (&json_ctx, 0, i, 1 << i, 1);
+      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
       do_test (&json_ctx, i, i, 1 << i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
+      do_test (&json_ctx, half_page, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page, i, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
     }
 
   for (i = 0; i < 32; ++i)
@@ -131,6 +138,12 @@ test_main (void)
       do_test (&json_ctx, i, 0, i, 0);
       do_test (&json_ctx, 0, i, i, 0);
       do_test (&json_ctx, i, i, i, 0);
+      do_test (&json_ctx, half_page, 0, i, 0);
+      do_test (&json_ctx, half_page + i, 0, i, 0);
+      do_test (&json_ctx, half_page, i, i, 0);
+      do_test (&json_ctx, half_page + i, i, i, 0);
+      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
+      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
     }
 
   for (i = 3; i < 32; ++i)
@@ -141,6 +154,10 @@ test_main (void)
       do_test (&json_ctx, i, 0, 16 * i, 1);
       do_test (&json_ctx, 0, i, 16 * i, 1);
       do_test (&json_ctx, i, i, 16 * i, 1);
+      do_test (&json_ctx, half_page, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page, i, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
     }
 
   for (i = 32; i < 64; ++i)
@@ -149,16 +166,33 @@ test_main (void)
       do_test (&json_ctx, i, 0, 32 * i, 1);
       do_test (&json_ctx, 0, i, 32 * i, 1);
       do_test (&json_ctx, i, i, 32 * i, 1);
+      do_test (&json_ctx, half_page, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page, i, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
     }
 
   do_test (&json_ctx, 0, 0, getpagesize (), 1);
 
-  for (i = 0; i <= 32; ++i)
+  for (i = 0; i <= 48; ++i)
     {
       do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
     }
 
   json_array_end (&json_ctx);
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index 6becbf4782..bec1455f7b 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -53,11 +53,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -85,6 +85,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
+  size_t half_page = getpagesize () / 2;
 
   test_init ();
 
@@ -138,6 +139,22 @@ test_main (void)
       do_test (&json_ctx, i, i, 32 * i);
     }
 
+  for (i = 0; i <= 48; ++i)
+    {
+      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
+    }
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1



* [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S
  2021-08-24  8:27 [PATCH 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein via Libc-alpha
                   ` (2 preceding siblings ...)
  2021-08-24  8:27 ` [PATCH 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein via Libc-alpha
@ 2021-08-24  8:27 ` Noah Goldstein via Libc-alpha
  2021-08-24  9:12   ` Noah Goldstein via Libc-alpha
  2021-08-24 15:17 ` [PATCH 1/5] string: Make tests bidirectional test-memcpy.c H.J. Lu via Libc-alpha
  2021-08-24 19:32 ` [PATCH v1 " Noah Goldstein via Libc-alpha
  5 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24  8:27 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memmove-vec-unaligned.S.

The optimizations are, in descending order of importance:
L(less_vec), L(movsb), the 8x forward/backward loops, and various
target alignments that have minimal code size impact.

The L(less_vec) optimizations are to:

    1. Readjust the branch order to either give hotter paths a fall
    through case or put fewer branches in their way (see the C sketch
    at the end of this commit message).
    2. Moderately change the size classes to make hot branches hotter
    and thus increase predictability.
    3. Try to minimize branch aliasing to avoid BPU-thrashing-based
    misses.
    4. 64 byte align the prior function entry. This is to avoid cases
    where seemingly unrelated changes end up having severe negative
    performance impacts.

The L(movsb) optimizations are to:

    1. Reduce the number of taken branches needed to determine if
    movsb should be used.
    2. 64 byte align dst if the CPU has FSRM or if dst and src do not
    4k alias.
    3. 64 byte align src if the CPU does not have FSRM and dst and
    src do 4k alias.

The 8x forward/backward loop optimizations are to:

    1. Reduce instructions needed for aligning to VEC_SIZE.
    2. Reduce uops and code size of the loops.

All tests in string/ passing.
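
To make the L(less_vec) reordering referenced above concrete, the
sketch below is an illustrative C rendering for VEC_SIZE == 32 only
(the actual assembly differs): the hottest [16, 32] class is the
fall-through, smaller classes branch away, and each class copies with
one load/store pair from each end of the buffer.

#include <stddef.h>
#include <string.h>

/* Illustrative rendering of the reworked small-copy dispatch for
   VEC_SIZE == 32 (len <= 32).  Overlapping head/tail copies handle
   the in-between lengths without extra branches.  */
static void
copy_le_32 (char *dst, const char *src, size_t len)
{
  if (len < 8)                        /* colder: [0, 7]  */
    {
      if (len >= 4)
        {
          memcpy (dst, src, 4);
          memcpy (dst + len - 4, src + len - 4, 4);
        }
      else if (len >= 2)
        {
          memcpy (dst, src, 2);
          memcpy (dst + len - 2, src + len - 2, 2);
        }
      else if (len == 1)
        *dst = *src;
    }
  else if (len < 16)                  /* [8, 15]  */
    {
      memcpy (dst, src, 8);
      memcpy (dst + len - 8, src + len - 8, 8);
    }
  else                                /* hottest: [16, 32], fall through  */
    {
      memcpy (dst, src, 16);
      memcpy (dst + len - 16, src + len - 16, 16);
    }
}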
---
See performance data attached.
Included benchmarks: memcpy-random, memcpy, memmove, memcpy-walk, memmove-walk, memcpy-large  

The first page is a summary with the ifunc selection version for
erms/non-erms for each computer. The following 4 sheets contain all
the numbers for sse2 and avx on Skylake and for sse2, avx2, evex, and
avx512 on Tigerlake.

Benchmark CPUs: Skylake:
https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

All times are geometric mean of N=30.

"Cur" refers to the current implementation "New" refers to this
patches implementation

Score refers to new/cur (low means improvement, high means
degradation). Scores are color coded: the greener the better, the
redder the worse.


Some notes on the numbers:

In my opinion most of the benchmarks where src/dst align are in [0,
64] have some unpredictable and unfortunate noise from non-obvious
false dependencies between stores to dst and next iterations loads
from src. For example in the 8x forward case, the store of VEC(4) will
end up stalling next iterations load queue, so if size was large
enough that the begining of dst was flushed from L1 this can have a
seemingly random but significant impact on the benchmark result.

There are significant performance improvements/degradations in the
[0, VEC_SIZE] range. I didn't treat these as important because I
think in this size range the branch pattern indicated by the random
tests matters more. On the random tests the new implementation
performs significantly better.

I also added logic to align before L(movsb). As the new random
benchmarks with fixed size show, this leads to roughly a 10-20%
performance improvement for some hot sizes. I am not 100% convinced
this is needed, as larger copies that reach movsb are generally
already aligned, but even in the fixed-loop cases, especially on
Skylake without FSRM, aligning before movsb seems to pay off. Let me
know if you think this is unnecessary.
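
For clarity, a rough C model of the no-FSRM alignment choice this
adds (the constants follow the assembly; the function name is
illustrative):

#include <stdint.h>

#define PAGE_SIZE 4096

/* If dst sits within 512 bytes above src modulo the page size, their
   page offsets nearly match and the copy will 4k alias, so align src
   to 64 bytes before rep movsb; otherwise align dst.  */
static int
should_align_src (uintptr_t dst, uintptr_t src)
{
  return ((dst - src) & (PAGE_SIZE - 512)) == 0;
}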

There are occasional performance degradations at odd spots throughout
the medium-range sizes in the fixed memcpy benchmarks. I think
generally there is more good than harm here, and at the moment I
don't have an explanation for why these particular configurations
seem to perform worse. On the plus side, however, it also seems that
there are unexplained improvements of the same magnitude patterned
with the degradations (and both are sparse), so I ultimately believe
it should be acceptable. If this is not the case let me know.

The memmove benchmarks look a bit worse, especially for the erms
case. Part of this is from the nop cases, which I didn't treat as
important. But part of it is also because, to optimize for what I
expect to be the common case of no overlap, the overlap case has
extra branches and overhead. I think this is inevitable when
implementing memmove and memcpy in the same file, but if this is
unacceptable let me know.


Note: I ran the benchmarks before two changes that made it into the final version:

-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-       ret
-#else
+       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
        VZEROUPPER_RETURN
-#endif



And

+       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)


I don't think either of these should have any impact.

I made the former change because I think it was a bug that could
cause use of avx2 without vzeroupper, and the latter because I think
it could cause issues on multicore platforms.

    
 sysdeps/x86/sysdep.h                          |  13 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 484 +++++++++++-------
 2 files changed, 317 insertions(+), 180 deletions(-)

diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index cac1d762fb..9226d2c6c9 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,18 @@ enum cf_protection_level
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
 
 /* Define an entry point visible from C.  */
-#define	ENTRY(name)							      \
-  .globl C_SYMBOL_NAME(name);						      \
-  .type C_SYMBOL_NAME(name),@function;					      \
-  .align ALIGNARG(4);							      \
+#define	P2ALIGN_ENTRY(name, alignment)							      \
+  .globl C_SYMBOL_NAME(name);							      \
+  .type C_SYMBOL_NAME(name),@function;							      \
+  .align ALIGNARG(alignment);							      \
   C_LABEL(name)								      \
   cfi_startproc;							      \
-  _CET_ENDBR;								      \
+  _CET_ENDBR;							      \
   CALL_MCOUNT
 
+#define	ENTRY(name) P2ALIGN_ENTRY(name, 4)
+
+
 #undef	END
 #define END(name)							      \
   cfi_endproc;								      \
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 9f02624375..75b6efe969 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -165,6 +165,32 @@
 # error Invalid LARGE_LOAD_SIZE
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64 byte alignment
+   and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE	>	16)
+
+/* Number of VECs to align movsb to.  */
+#if VEC_SIZE == 64
+# define MOVSB_ALIGN_TO	(VEC_SIZE)
+#else
+# define MOVSB_ALIGN_TO	(VEC_SIZE	*	2)
+#endif
+
+/* Macro for copying inclusive power of 2 range with two register
+   loads.  */
+#define COPY_BLOCK(mov_inst, src_reg, dst_reg, size_reg, len, tmp_reg0, tmp_reg1)	\
+	mov_inst (%src_reg), %tmp_reg0; \
+	mov_inst -(len)(%src_reg, %size_reg), %tmp_reg1; \
+	mov_inst %tmp_reg0, (%dst_reg); \
+	mov_inst %tmp_reg1, -(len)(%dst_reg, %size_reg);
+
+/* Define all copies used by L(less_vec) for VEC_SIZE of 16, 32, or
+   64.  */
+#define COPY_4_8	COPY_BLOCK(movl, rsi, rdi, rdx, 4, ecx, esi)
+#define COPY_8_16	COPY_BLOCK(movq, rsi, rdi, rdx, 8, rcx, rsi)
+#define COPY_16_32	COPY_BLOCK(vmovdqu, rsi, rdi, rdx, 16, xmm0, xmm1)
+#define COPY_32_64	COPY_BLOCK(vmovdqu64, rsi, rdi, rdx, 32, ymm16, ymm17)
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -198,7 +224,13 @@ L(start):
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
+	/* Based on SPEC2017 distribution both 16 and 32 memcpy calls are
+	   really hot so we want them to take the same branch path.  */
+#if VEC_SIZE > 16
+	jbe	L(less_vec)
+#else
 	jb	L(less_vec)
+#endif
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 #if !defined USE_MULTIARCH || !IS_IN (libc)
@@ -206,15 +238,10 @@ L(last_2x_vec):
 #endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
-#else
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
-#endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
@@ -289,7 +316,9 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+/* Cache align entry so that branch heavy L(less_vec) maintains good
+   alignment.  */
+P2ALIGN_ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -297,123 +326,217 @@ L(start_erms):
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
+	/* Based on SPEC2017 distribution both 16 and 32 memcpy calls are
+	   really hot so we want them to take the same branch path.  */
+# if VEC_SIZE > 16
+	jbe	L(less_vec)
+# else
 	jb	L(less_vec)
+# endif
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
+#if VEC_SIZE == 64
+L(copy_8_15):
+	COPY_8_16
+	ret
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+L(copy_33_63):
+	COPY_32_64
 	ret
 #endif
-
+	/* Only worth aligning if near end of 16 byte block and won't get
+	   first branch in first decode after jump.  */
+	.p2align 4,, 6
 L(less_vec):
-	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
-#if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
+	/* Second set of branches for smallest copies.  */
+	cmpl	$(VEC_SIZE / 4), %edx
+	jb	L(less_quarter_vec)
+
+	cmpl	$(VEC_SIZE / 2), %edx
+#if VEC_SIZE == 64
+	/* We branch to [33, 63] instead of [16, 32] to give [16, 32] fall
+	   through path as [16, 32] is hotter.  */
+	ja	L(copy_33_63)
+	COPY_16_32
+#elif VEC_SIZE == 32
+	/* Branch to [8, 15]. Fall through to [16, 32].  */
+	jb	L(copy_8_15)
+	COPY_16_32
+#else
+	/* Branch to [4, 7]. Fall through to [8, 15].  */
+	jb	L(copy_4_7)
+	COPY_8_16
 #endif
-#if VEC_SIZE > 16
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-#endif
-	cmpb	$8, %dl
-	jae	L(between_8_15)
-	cmpb	$4, %dl
-	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
+	ret
+	/* Align if won't cost too many bytes.  */
+	.p2align 4,, 6
+L(copy_4_7):
+	COPY_4_8
+	ret
+
+	/* Cold target. No need to align.  */
+L(copy_1):
 	movzbl	(%rsi), %ecx
 	movb	%cl, (%rdi)
-1:
 	ret
+
+	/* Colder copy case for [0, VEC_SIZE / 4 - 1].  */
+L(less_quarter_vec):
 #if VEC_SIZE > 32
-L(between_32_63):
-	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
 #endif
 #if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-L(between_8_15):
-	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
-	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rsi, (%rdi)
-	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+#endif
+	cmpl	$1, %edx
+	je	L(copy_1)
+	jb	L(copy_0)
+	/* Fall through into copy [2, 3] as it is more common than [0, 1].
+	 */
+	movzwl	(%rsi), %ecx
+	movzbl	-1(%rsi, %rdx), %esi
+	movw	%cx, (%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+L(copy_0):
 	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
+
+	.p2align 4
+#if VEC_SIZE == 32
+L(copy_8_15):
+	COPY_8_16
 	ret
+	/* COPY_8_16 is exactly 17 bytes so don't want to p2align after as
+	   it wastes 15 bytes of code and 1 byte off is fine.  */
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward movsb is slow.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+	/* If above __x86_rep_movsb_stop_threshold most likely is a candidate
+	   for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if ALIGN_MOVSB
+	VMOVU	(%rsi), %VEC(0)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Store dst for use after rep movsb.  */
+	movq	%rdi, %r8
+# endif
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy conditions
+	   means only case of slow movsb with src = dst + [0, 63] is ecx in
+	   [-63, 0]. Use unsigned comparison with -64 check for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+# endif
+# if ALIGN_MOVSB
+	/* Fall through means cpu has FSRM. In that case exclusively align
+	   destination.  */
+
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%rdi, %rdx), %rcx
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.  */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
 
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Add dst to len. Subtract back after dst aligned. -1 because dst
+	   is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-(1)(%rdi, %rdx), %rcx
+	/* Inclusively align dst to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rdi
+	leaq	1(%rdi, %rsi), %rsi
+	/* Restore src and len adjusted with new values for aligned dst.  */
+	subq	%rdi, %rcx
+	/* Finish aligning dst.  */
+	incq	%rdi
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+
+L(skip_short_movsb_check):
+	/* If CPU does not have FSRM two options for aligning. Align src if
+	   dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because src
+	   is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-(1)(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned dst.  */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else
+	/* Not aligning rep movsb so just copy.  */
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
+	/* Align if doesn't cost too many bytes.  */
+	.p2align 4,, 6
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
@@ -426,50 +549,60 @@ L(more_2x_vec):
 	ja	L(more_8x_vec)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
+	/* Align if doesn't cost too much code size. 6 bytes so that after
+	   jump to target a full mov instruction will always be able to be
+	   fetched.  */
+	.p2align 4,, 6
 L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	/* Keep nop target close to jmp for 2-byte encoding.  */
+L(nop):
 	VZEROUPPER_RETURN
-
+	/* Align if doesn't cost too much code size.  */
+	.p2align 4,, 10
 L(more_8x_vec):
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* Entry if rdx is greater than non-temporal threshold but there is
+	   overlap.  */
 L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
 	je	L(nop)
+	/* Entry if rdx is greater than movsb or stop movsb threshold but
+	   there is overlap with dst > src.  */
+L(more_8x_vec_forward):
 	/* Load the first VEC and last 4 * VEC to support overlapping
 	   addresses.  */
 	VMOVU	(%rsi), %VEC(4)
@@ -477,22 +610,18 @@ L(more_8x_vec_check):
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
-
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	leaq	1(%rdi, %rsi), %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
 	.p2align 4
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
@@ -501,23 +630,27 @@ L(loop_4x_vec_forward):
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%rdi)
 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(4), (%rcx)
+	/* Keep nop target close to jmp for 2-byte encoding.  */
+L(nop2):
 	VZEROUPPER_RETURN
-
+	/* Entry from fail movsb. Need to test if dst - src == 0 still.  */
+L(more_8x_vec_backward_check_nop):
+	testq	%rcx, %rcx
+	jz	L(nop2)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
@@ -525,49 +658,50 @@ L(more_8x_vec_backward):
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Save beginning of buffer.  */
+	movq	%rdi, %rcx
+	/* Set dst to beginning of region to copy. -1 for inclusive
+	   alignment.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rdi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rdi
+	/* Restore src.  */
+	addq	%rdi, %rsi
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(0)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(3)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 0)(%rdi)
+	addq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rdi, %rcx
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), (%rcx)
+	VMOVU	%VEC(5), VEC_SIZE(%rcx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rcx)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rcx)
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	.p2align 4
+	/* Entry if size is greater than the stop movsb threshold (usually
+	   set to the non-temporal threshold).  */
+L(large_memcpy_2x_check):
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_forward)
 L(large_memcpy_2x):
 	/* Compute absolute value of difference between source and
 	   destination.  */
-- 
2.25.1



* Re: [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S
  2021-08-24  8:27 ` [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Noah Goldstein via Libc-alpha
@ 2021-08-24  9:12   ` Noah Goldstein via Libc-alpha
  0 siblings, 0 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24  9:12 UTC (permalink / raw)
  To: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 30396 bytes --]

On Tue, Aug 24, 2021 at 4:29 AM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> No bug. This commit optimizes memmove-vec-unaligned.S.
>
> The optimizations are in descending order of importance to the
> L(less_vec), L(movsb), the 8x forward/backward loops and various
> target alignments that have minimal code size impact.
>
> The L(less_vec) optimizations are to:
>
>     1. Readjust the branch order to either given hotter paths a fall
>     through case or have less branches in there way.
>     2. Moderately change the size classes to make hot branches hotter
>     and thus increase predictability.
>     3. Try and minimize branch aliasing to avoid BPU thrashing based
>     misses.
>     4. 64 byte the prior function entry. This is to avoid cases where
>     seemingly unrelated changes end up have severe negative
>     performance impacts.
>
> The L(movsb) optimizations are to:
>
>     1. Reduce the number of taken branches needed to determine if
>     movsb should be used.
>     2. 64 byte align either dst if the CPU has fsrm or if dst and src
>     do not 4k alias.
>     3. 64 byte align src if the CPU does not have fsrm and dst and src
>     do 4k alias.
>
> The 8x forward/backward loop optimizations are to:
>
>     1. Reduce instructions needed for aligning to VEC_SIZE.
>     2. Reduce uops and code size of the loops.
>
> All tests in string/ passing.
> ---
> See performance data attached.
> Included benchmarks: memcpy-random, memcpy, memmove, memcpy-walk,
> memmove-walk, memcpy-large
>
> The first page is a summary with the ifunc selection version for
> erms/non-erms for each computers. Then in the following 4 sheets are
> all the numbers for sse2, avx for Skylake and sse2, avx2, evex, and
> avx512 for Tigerlake.
>
> Benchmark CPUS: Skylake:
>
> https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
>
> Tigerlake:
>
> https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
>
> All times are geometric mean of N=30.
>
> "Cur" refers to the current implementation "New" refers to this
> patches implementation
>
> Score refers to new/cur (low means improvement, high means
> degragation). Scores are color coded. The more green the better, the
> more red the worse.
>
>
> Some notes on the numbers:
>
> In my opinion most of the benchmarks where src/dst align are in [0,
> 64] have some unpredictable and unfortunate noise from non-obvious
> false dependencies between stores to dst and next iterations loads
> from src. For example in the 8x forward case, the store of VEC(4) will
> end up stalling next iterations load queue, so if size was large
> enough that the begining of dst was flushed from L1 this can have a
> seemingly random but significant impact on the benchmark result.
>
> There are significant performance improvements/degregations in the [0,
> VEC_SIZE]. I didn't treat these as imporant as I think in this size
> range the branch pattern indicated by the random tests is more
> important. On the random tests the new implementation performance
> significantly better.
>
> I also added logic to align before L(movsb). If you see the new random
> benchmarks with fixed size this leads to roughly a 10-20% performance
> improvement for some hot sizes. I am not 100% convinced this is needed
> as generally for larger copies that would go to movsb they are already
> aligned but even in the fixed loop cases, especially on Skylake w.o
> FSRM it seems aligning before movsb pays off. Let me know if you think
> this is unnecessary.
>
> There are occasional performance degregations at odd splots throughout
> the medium range sizes in the fixed memcpy benchmarks. I think
> generally there is more good than harm here and at the moment I don't
> have an explination for why these certain configurations seem to
> perform worse. On the plus side, however, it also seems that there are
> unexplained improvements of the same magnitude patterened with the
> degregations (and both are sparse) so I ultimately believe it should
> be acceptable. if this is not the case let me know.
>
> The memmove benchmarks look a bit worse, especially for the erms
> case. Part of this is from the nop cases which I didn't treat as
> important. But part of it is also because to optimize for what I
> expect to be the common case of no overlap the overlap case has extra
> branches and overhead. I think this is inevitable when implementing
> memmove and memcpy in the same file, but if this is unacceptable let
> me know.
>
>
> Note: I benchmarks before two changes that made it into the final version:
>
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(nop):
> -       ret
> -#else
> +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
> -#endif
>
>
>
> And
>
> +       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>
>
> I don't think either of these should have any impact.
>
> I made the former change because I think it was a bug that could cause
> use of avx2 without vzeroupper, and the latter because I think it
> could cause issues on multicore platforms.
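>
> In C terms the difference between the two is roughly the following
> (the variable and macro are illustrative stand-ins for the real glibc
> names, not the actual definitions):
>
> static unsigned int x86_string_control;
> #define AVOID_SHORT_DISTANCE_REP_MOVSB 0x1
>
> /* testl: read-only check, the shared word is never written.  */
> static int
> flag_set_testl (void)
> {
>   return (x86_string_control & AVOID_SHORT_DISTANCE_REP_MOVSB) != 0;
> }
>
> /* andl: read-modify-write, so every call stores to the shared word
>    (and clobbers its other bits), which is what could hurt when many
>    cores call memcpy concurrently.  */
> static int
> flag_set_andl (void)
> {
>   x86_string_control &= AVOID_SHORT_DISTANCE_REP_MOVSB;
>   return x86_string_control != 0;
> }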
>
>
>  sysdeps/x86/sysdep.h                          |  13 +-
>  .../multiarch/memmove-vec-unaligned-erms.S    | 484 +++++++++++-------
>  2 files changed, 317 insertions(+), 180 deletions(-)
>
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index cac1d762fb..9226d2c6c9 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -78,15 +78,18 @@ enum cf_protection_level
>  #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
>
>  /* Define an entry point visible from C.  */
> -#define        ENTRY(name)                                               \
> -  .globl C_SYMBOL_NAME(name);                                            \
> -  .type C_SYMBOL_NAME(name),@function;                                   \
> -  .align ALIGNARG(4);                                                    \
> +#define        P2ALIGN_ENTRY(name, alignment)                            \
> +  .globl C_SYMBOL_NAME(name);                                            \
> +  .type C_SYMBOL_NAME(name),@function;                                   \
> +  .align ALIGNARG(alignment);                                            \
>    C_LABEL(name)                                                          \
>    cfi_startproc;                                                         \
> -  _CET_ENDBR;                                                            \
> +  _CET_ENDBR;                                                        \
>    CALL_MCOUNT
>
> +#define        ENTRY(name) P2ALIGN_ENTRY(name, 4)
> +
> +
>  #undef END
>  #define END(name)                                                        \
>    cfi_endproc;                                                           \
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index 9f02624375..75b6efe969 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -165,6 +165,32 @@
>  # error Invalid LARGE_LOAD_SIZE
>  #endif
>
> +/* Whether to align before movsb. Ultimately we want 64 byte align
> +   and not worth it to load 4x VEC for VEC_SIZE == 16.  */
> +#define ALIGN_MOVSB    (VEC_SIZE       >       16)
> +
> +/* Number of VECs to align movsb to.  */
> +#if VEC_SIZE == 64
> +# define MOVSB_ALIGN_TO        (VEC_SIZE)
> +#else
> +# define MOVSB_ALIGN_TO        (VEC_SIZE       *       2)
> +#endif
> +
> +/* Macro for copying inclusive power of 2 range with two register
> +   loads.  */
> +#define COPY_BLOCK(mov_inst, src_reg, dst_reg, size_reg, len, tmp_reg0, tmp_reg1)      \
> +       mov_inst (%src_reg), %tmp_reg0; \
> +       mov_inst -(len)(%src_reg, %size_reg), %tmp_reg1; \
> +       mov_inst %tmp_reg0, (%dst_reg); \
> +       mov_inst %tmp_reg1, -(len)(%dst_reg, %size_reg);
> +
> +/* Define all copies used by L(less_vec) for VEC_SIZE of 16, 32, or
> +   64.  */
> +#define COPY_4_8       COPY_BLOCK(movl, rsi, rdi, rdx, 4, ecx, esi)
> +#define COPY_8_16      COPY_BLOCK(movq, rsi, rdi, rdx, 8, rcx, rsi)
> +#define COPY_16_32     COPY_BLOCK(vmovdqu, rsi, rdi, rdx, 16, xmm0, xmm1)
> +#define COPY_32_64     COPY_BLOCK(vmovdqu64, rsi, rdi, rdx, 32, ymm16, ymm17)
> +
>  #ifndef SECTION
>  # error SECTION is not defined!
>  #endif
> @@ -198,7 +224,13 @@ L(start):
>         movl    %edx, %edx
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
> +       /* Based on SPEC2017 distribution both 16 and 32 memcpy calls are
> +          really hot so we want them to take the same branch path.  */
> +#if VEC_SIZE > 16
> +       jbe     L(less_vec)
> +#else
>         jb      L(less_vec)
> +#endif
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(more_2x_vec)
>  #if !defined USE_MULTIARCH || !IS_IN (libc)
> @@ -206,15 +238,10 @@ L(last_2x_vec):
>  #endif
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
>         VMOVU   (%rsi), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
>         VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(nop):
> -       ret
> -#else
> +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
> -#endif
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMMOVE_SYMBOL (__memmove, unaligned))
>
> @@ -289,7 +316,9 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  # endif
>
> -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> +/* Cache align entry so that branch heavy L(less_vec) maintains good
> +   alignment.  */
> +P2ALIGN_ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
>         movq    %rdi, %rax
>  L(start_erms):
>  # ifdef __ILP32__
> @@ -297,123 +326,217 @@ L(start_erms):
>         movl    %edx, %edx
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
> +       /* Based on SPEC2017 distribution both 16 and 32 memcpy calls are
> +          really hot so we want them to take the same branch path.  */
> +# if VEC_SIZE > 16
> +       jbe     L(less_vec)
> +# else
>         jb      L(less_vec)
> +# endif
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(movsb_more_2x_vec)
>  L(last_2x_vec):
> -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
>         VMOVU   (%rsi), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
>         VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
>  L(return):
> -#if VEC_SIZE > 16
> +# if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
> -#else
> +# else
>         ret
> +# endif
>  #endif
> +#if VEC_SIZE == 64
> +L(copy_8_15):
> +       COPY_8_16
> +       ret
>
> -L(movsb):
> -       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> -       jae     L(more_8x_vec)
> -       cmpq    %rsi, %rdi
> -       jb      1f
> -       /* Source == destination is less common.  */
> -       je      L(nop)
> -       leaq    (%rsi,%rdx), %r9
> -       cmpq    %r9, %rdi
> -       /* Avoid slow backward REP MOVSB.  */
> -       jb      L(more_8x_vec_backward)
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rdi, %rcx
> -       subq    %rsi, %rcx
> -       jmp     2f
> -# endif
> -1:
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rsi, %rcx
> -       subq    %rdi, %rcx
> -2:
> -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> -   is N*4GB + [1..63] with N >= 0.  */
> -       cmpl    $63, %ecx
> -       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> -3:
> -# endif
> -       mov     %RDX_LP, %RCX_LP
> -       rep movsb
> -L(nop):
> +L(copy_33_63):
> +       COPY_32_64
>         ret
>  #endif
> -
> +       /* Only worth aligning if near end of 16 byte block and won't get
> +          first branch in first decode after jump.  */
> +       .p2align 4,, 6
>  L(less_vec):
> -       /* Less than 1 VEC.  */
>  #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  # error Unsupported VEC_SIZE!
>  #endif
> -#if VEC_SIZE > 32
> -       cmpb    $32, %dl
> -       jae     L(between_32_63)
> +       /* Second set of branches for smallest copies.  */
> +       cmpl    $(VEC_SIZE / 4), %edx
> +       jb      L(less_quarter_vec)
> +
> +       cmpl    $(VEC_SIZE / 2), %edx
> +#if VEC_SIZE == 64
> +       /* We branch to [33, 63] instead of [16, 32] to give [16, 32] fall
> +          through path as [16, 32] is hotter.  */
> +       ja      L(copy_33_63)
> +       COPY_16_32
> +#elif VEC_SIZE == 32
> +       /* Branch to [8, 15]. Fall through to [16, 32].  */
> +       jb      L(copy_8_15)
> +       COPY_16_32
> +#else
> +       /* Branch to [4, 7]. Fall through to [8, 15].  */
> +       jb      L(copy_4_7)
> +       COPY_8_16
>  #endif
> -#if VEC_SIZE > 16
> -       cmpb    $16, %dl
> -       jae     L(between_16_31)
> -#endif
> -       cmpb    $8, %dl
> -       jae     L(between_8_15)
> -       cmpb    $4, %dl
> -       jae     L(between_4_7)
> -       cmpb    $1, %dl
> -       ja      L(between_2_3)
> -       jb      1f
> +       ret
> +       /* Align if won't cost too many bytes.  */
> +       .p2align 4,, 6
> +L(copy_4_7):
> +       COPY_4_8
> +       ret
> +
> +       /* Cold target. No need to align.  */
> +L(copy_1):
>         movzbl  (%rsi), %ecx
>         movb    %cl, (%rdi)
> -1:
>         ret
> +
> +       /* Colder copy case for [0, VEC_SIZE / 4 - 1].  */
> +L(less_quarter_vec):
>  #if VEC_SIZE > 32
> -L(between_32_63):
> -       /* From 32 to 63.  No branch when size == 32.  */
> -       VMOVU   (%rsi), %YMM0
> -       VMOVU   -32(%rsi,%rdx), %YMM1
> -       VMOVU   %YMM0, (%rdi)
> -       VMOVU   %YMM1, -32(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> +       cmpl    $8, %edx
> +       jae     L(copy_8_15)
>  #endif
>  #if VEC_SIZE > 16
> -       /* From 16 to 31.  No branch when size == 16.  */
> -L(between_16_31):
> -       VMOVU   (%rsi), %XMM0
> -       VMOVU   -16(%rsi,%rdx), %XMM1
> -       VMOVU   %XMM0, (%rdi)
> -       VMOVU   %XMM1, -16(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> -#endif
> -L(between_8_15):
> -       /* From 8 to 15.  No branch when size == 8.  */
> -       movq    -8(%rsi,%rdx), %rcx
> -       movq    (%rsi), %rsi
> -       movq    %rcx, -8(%rdi,%rdx)
> -       movq    %rsi, (%rdi)
> -       ret
> -L(between_4_7):
> -       /* From 4 to 7.  No branch when size == 4.  */
> -       movl    -4(%rsi,%rdx), %ecx
> -       movl    (%rsi), %esi
> -       movl    %ecx, -4(%rdi,%rdx)
> -       movl    %esi, (%rdi)
> +       cmpl    $4, %edx
> +       jae     L(copy_4_7)
> +#endif
> +       cmpl    $1, %edx
> +       je      L(copy_1)
> +       jb      L(copy_0)
> +       /* Fall through into copy [2, 3] as it is more common than [0, 1].
> +        */
> +       movzwl  (%rsi), %ecx
> +       movzbl  -1(%rsi, %rdx), %esi
> +       movw    %cx, (%rdi)
> +       movb    %sil, -1(%rdi, %rdx)
> +L(copy_0):
>         ret
> -L(between_2_3):
> -       /* From 2 to 3.  No branch when size == 2.  */
> -       movzwl  -2(%rsi,%rdx), %ecx
> -       movzwl  (%rsi), %esi
> -       movw    %cx, -2(%rdi,%rdx)
> -       movw    %si, (%rdi)
> +
> +       .p2align 4
> +#if VEC_SIZE == 32
> +L(copy_8_15):
> +       COPY_8_16
>         ret
> +       /* COPY_8_16 is exactly 17 bytes so don't want to p2align after as
> +          it wastes 15 bytes of code and 1 byte off is fine.  */
> +#endif
> +
> +#if defined USE_MULTIARCH && IS_IN (libc)
> +L(movsb):
> +       movq    %rdi, %rcx
> +       subq    %rsi, %rcx
> +       /* Go to backwards temporal copy if overlap no matter what as
> +          backward movsb is slow.  */
> +       cmpq    %rdx, %rcx
> +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +       jb      L(more_8x_vec_backward_check_nop)
> +       /* If above __x86_rep_movsb_stop_threshold most likely is candidate
> +          for NT moves as well.  */
> +       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> +       jae     L(large_memcpy_2x_check)
> +# if ALIGN_MOVSB
> +       VMOVU   (%rsi), %VEC(0)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +       /* Store dst for use after rep movsb.  */
> +       movq    %rdi, %r8
> +# endif
> +# if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       /* Only avoid short movsb if CPU has FSRM.  */
> +       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +       jz      L(skip_short_movsb_check)
> +       /* Avoid "rep movsb" if RCX, the distance between source and
> +          destination, is N*4GB + [1..63] with N >= 0.  */
> +
> +       /* ecx contains dst - src. Early check for backward copy conditions
> +          means only case of slow movsb with src = dst + [0, 63] is ecx in
> +          [-63, 0]. Use unsigned comparison with -64 check for that case.  */
> +       cmpl    $-64, %ecx
> +       ja      L(more_8x_vec_forward)
> +# endif
> +# if ALIGN_MOVSB
> +       /* Fall through means cpu has FSRM. In that case exclusively align
> +          destination.  */
> +
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Add dst to len. Subtract back after dst aligned.  */
> +       leaq    (%rdi, %rdx), %rcx
> +       /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
> +       addq    $(MOVSB_ALIGN_TO - 1), %rdi
> +       andq    $-(MOVSB_ALIGN_TO), %rdi
> +       /* Restore src and len adjusted with new values for aligned dst.  */
> +       addq    %rdi, %rsi
> +       subq    %rdi, %rcx
>
> +       rep     movsb
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
> +       VZEROUPPER_RETURN
> +L(movsb_align_dst):
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Add dst to len. Subtract back after dst aligned. -1 because dst
> +          is initially aligned to MOVSB_ALIGN_TO - 1.  */
> +       leaq    -(1)(%rdi, %rdx), %rcx
> +       /* Inclusively align dst to MOVSB_ALIGN_TO - 1.  */
> +       orq     $(MOVSB_ALIGN_TO - 1), %rdi
> +       leaq    1(%rdi, %rsi), %rsi
> +       /* Restore src and len adjusted with new values for aligned dst.  */
> +       subq    %rdi, %rcx
> +       /* Finish aligning dst.  */
> +       incq    %rdi
> +       rep     movsb
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
> +       VZEROUPPER_RETURN
> +
> +L(skip_short_movsb_check):
> +       /* If CPU does not have FSRM two options for aligning. Align src if
> +          dst and src 4k alias. Otherwise align dst.  */
> +       testl   $(PAGE_SIZE - 512), %ecx
> +       jnz     L(movsb_align_dst)
> +       /* rcx already has dst - src.  */
> +       movq    %rcx, %r9
> +       /* Add src to len. Subtract back after src aligned. -1 because src
> +          is initially aligned to MOVSB_ALIGN_TO - 1.  */
> +       leaq    -(1)(%rsi, %rdx), %rcx
> +       /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
> +       orq     $(MOVSB_ALIGN_TO - 1), %rsi
> +       /* Restore dst and len adjusted with new values for aligned dst.  */
> +       leaq    1(%rsi, %r9), %rdi
> +       subq    %rsi, %rcx
> +       /* Finish aligning src.  */
> +       incq    %rsi
> +       rep     movsb
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
> +       VZEROUPPER_RETURN
> +# else
> +       /* Not aligning rep movsb so just copy.  */
> +       mov     %RDX_LP, %RCX_LP
> +       rep     movsb
> +       ret
> +# endif
> +#endif
> +       /* Align if doesn't cost too many bytes.  */
> +       .p2align 4,, 6
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  L(movsb_more_2x_vec):
>         cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> @@ -426,50 +549,60 @@ L(more_2x_vec):
>         ja      L(more_8x_vec)
>         cmpq    $(VEC_SIZE * 4), %rdx
>         jbe     L(last_4x_vec)
> -       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> +       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
>         VMOVU   (%rsi), %VEC(0)
>         VMOVU   VEC_SIZE(%rsi), %VEC(1)
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(4)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
>         VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(1), VEC_SIZE(%rdi)
>         VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
>         VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> -       VMOVU   %VEC(4), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> +       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
>         VZEROUPPER_RETURN
> +       /* Align if doesn't cost too much code size. 6 bytes so that after
> +          jump to target a full mov instruction will always be able to be
> +          fetched.  */
> +       .p2align 4,, 6
>  L(last_4x_vec):
> -       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> +       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
>         VMOVU   (%rsi), %VEC(0)
>         VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
>         VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> +       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       /* Keep nop target close to jmp for 2-byte encoding.  */
> +L(nop):
>         VZEROUPPER_RETURN
> -
> +       /* Align if doesn't cost too much code size.  */
> +       .p2align 4,, 10
>  L(more_8x_vec):
>         /* Check if non-temporal move candidate.  */
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
>         /* Check non-temporal store threshold.  */
> -       cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
>         ja      L(large_memcpy_2x)
>  #endif
> -       /* Entry if rdx is greater than non-temporal threshold but there
> -       is overlap.  */
> +       /* Entry if rdx is greater than non-temporal threshold but there is
> +          overlap.  */
>  L(more_8x_vec_check):
>         cmpq    %rsi, %rdi
>         ja      L(more_8x_vec_backward)
>         /* Source == destination is less common.  */
>         je      L(nop)
> +       /* Entry if rdx is greater than movsb or stop movsb threshold but
> +          there is overlap with dst > src.  */
> +L(more_8x_vec_forward):
>         /* Load the first VEC and last 4 * VEC to support overlapping
>            addresses.  */
>         VMOVU   (%rsi), %VEC(4)
> @@ -477,22 +610,18 @@ L(more_8x_vec_check):
>         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
>         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
>         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> -       /* Save start and stop of the destination buffer.  */
> -       movq    %rdi, %r11
> -       leaq    -VEC_SIZE(%rdi, %rdx), %rcx
> -       /* Align destination for aligned stores in the loop.  Compute
> -          how much destination is misaligned.  */
> -       movq    %rdi, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Get the negative of offset for alignment.  */
> -       subq    $VEC_SIZE, %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rsi
> -       /* Adjust destination which should be aligned now.  */
> -       subq    %r8, %rdi
> -       /* Adjust length.  */
> -       addq    %r8, %rdx
> -
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Store end of buffer minus tail in rdx.  */
> +       leaq    (VEC_SIZE * -4)(%rdi, %rdx), %rdx
> +       /* Save beginning of dst.  */
> +       movq    %rdi, %rcx
> +       /* Align dst to VEC_SIZE - 1.  */
> +       orq     $(VEC_SIZE - 1), %rdi
> +       /* Restore src adjusted with new value for aligned dst.  */
> +       leaq    1(%rdi, %rsi), %rsi
> +       /* Finish aligning dst.  */
> +       incq    %rdi
>         .p2align 4
>  L(loop_4x_vec_forward):
>         /* Copy 4 * VEC a time forward.  */
> @@ -501,23 +630,27 @@ L(loop_4x_vec_forward):
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
>         subq    $-(VEC_SIZE * 4), %rsi
> -       addq    $-(VEC_SIZE * 4), %rdx
>         VMOVA   %VEC(0), (%rdi)
>         VMOVA   %VEC(1), VEC_SIZE(%rdi)
>         VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
>         VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
> -       cmpq    $(VEC_SIZE * 4), %rdx
> +       cmpq    %rdi, %rdx
>         ja      L(loop_4x_vec_forward)
>         /* Store the last 4 * VEC.  */
> -       VMOVU   %VEC(5), (%rcx)
> -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> +       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> +       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> +       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> +       VMOVU   %VEC(8), (%rdx)
>         /* Store the first VEC.  */
> -       VMOVU   %VEC(4), (%r11)
> +       VMOVU   %VEC(4), (%rcx)
> +       /* Keep nop target close to jmp for 2-byte encoding.  */
> +L(nop2):
>         VZEROUPPER_RETURN
> -
> +       /* Entry from fail movsb. Need to test if dst - src == 0 still.  */
> +L(more_8x_vec_backward_check_nop):
> +       testq   %rcx, %rcx
> +       jz      L(nop2)
>  L(more_8x_vec_backward):
>         /* Load the first 4 * VEC and last VEC to support overlapping
>            addresses.  */
> @@ -525,49 +658,50 @@ L(more_8x_vec_backward):
>         VMOVU   VEC_SIZE(%rsi), %VEC(5)
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(8)
> -       /* Save stop of the destination buffer.  */
> -       leaq    -VEC_SIZE(%rdi, %rdx), %r11
> -       /* Align destination end for aligned stores in the loop.  Compute
> -          how much destination end is misaligned.  */
> -       leaq    -VEC_SIZE(%rsi, %rdx), %rcx
> -       movq    %r11, %r9
> -       movq    %r11, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rcx
> -       /* Adjust the end of destination which should be aligned now.  */
> -       subq    %r8, %r9
> -       /* Adjust length.  */
> -       subq    %r8, %rdx
> -
> -       .p2align 4
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Save beginning of buffer.  */
> +       movq    %rdi, %rcx
> +       /* Set dst to beginning of region to copy. -1 for inclusive
> +          alignment.  */
> +       leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rdi
> +       /* Align dst.  */
> +       andq    $-(VEC_SIZE), %rdi
> +       /* Restore src.  */
> +       addq    %rdi, %rsi
> +       /* Don't use multi-byte nop to align.  */
> +       .p2align 4,, 11
>  L(loop_4x_vec_backward):
>         /* Copy 4 * VEC a time backward.  */
> -       VMOVU   (%rcx), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> -       addq    $-(VEC_SIZE * 4), %rcx
> -       addq    $-(VEC_SIZE * 4), %rdx
> -       VMOVA   %VEC(0), (%r9)
> -       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> -       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> -       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> -       addq    $-(VEC_SIZE * 4), %r9
> -       cmpq    $(VEC_SIZE * 4), %rdx
> -       ja      L(loop_4x_vec_backward)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(0)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(1)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(3)
> +       addq    $(VEC_SIZE * -4), %rsi
> +       VMOVA   %VEC(0), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VEC(1), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VEC(2), (VEC_SIZE * 1)(%rdi)
> +       VMOVA   %VEC(3), (VEC_SIZE * 0)(%rdi)
> +       addq    $(VEC_SIZE * -4), %rdi
> +       cmpq    %rdi, %rcx
> +       jb      L(loop_4x_vec_backward)
>         /* Store the first 4 * VEC.  */
> -       VMOVU   %VEC(4), (%rdi)
> -       VMOVU   %VEC(5), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VEC(4), (%rcx)
> +       VMOVU   %VEC(5), VEC_SIZE(%rcx)
> +       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rcx)
> +       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rcx)
>         /* Store the last VEC.  */
> -       VMOVU   %VEC(8), (%r11)
> +       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rcx)
>         VZEROUPPER_RETURN
>
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
>         .p2align 4
> +       /* Entry if dst > stop movsb threshold (usually set to non-temporal
> +          threshold).  */
> +L(large_memcpy_2x_check):
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       jb      L(more_8x_vec_forward)
>  L(large_memcpy_2x):
>         /* Compute absolute value of difference between source and
>            destination.  */
> --
> 2.25.1
>
>

[-- Attachment #2: skl-tgl-summary.tar.gz --]
[-- Type: application/gzip, Size: 932674 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] string: Make tests birdirectional test-memcpy.c
  2021-08-24  8:27 [PATCH 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein via Libc-alpha
                   ` (3 preceding siblings ...)
  2021-08-24  8:27 ` [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Noah Goldstein via Libc-alpha
@ 2021-08-24 15:17 ` H.J. Lu via Libc-alpha
  2021-08-24 19:32 ` [PATCH v1 " Noah Goldstein via Libc-alpha
  5 siblings, 0 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-08-24 15:17 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Tue, Aug 24, 2021 at 1:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit updates the memcpy tests to test both dst > src and dst <
> src. This is because there is logic in the code based on the
> condition.
> ---
>  string/test-memcpy.c  | 125 +++++++++++++++++++++++++++++++++---------
>  string/test-memmove.c |  73 +++++++++++++++++++++++-
>  2 files changed, 170 insertions(+), 28 deletions(-)
>
> diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> index c9dfc88fed..705d79ba13 100644
> --- a/string/test-memcpy.c
> +++ b/string/test-memcpy.c
> @@ -79,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
>  static void
>  do_test (size_t align1, size_t align2, size_t len)
>  {
> -  size_t i, j;
> +  size_t i, j, repeats;
>    char *s1, *s2;
>
>    align1 &= 4095;
> @@ -92,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
>
>    s1 = (char *) (buf1 + align1);
>    s2 = (char *) (buf2 + align2);
> +  for (repeats = 0; repeats < 2; ++repeats)
> +    {
> +      for (i = 0, j = 1; i < len; i++, j += 23)
> +        s1[i] = j;
>
> -  for (i = 0, j = 1; i < len; i++, j += 23)
> -    s1[i] = j;
> -
> -  FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len);
> +      FOR_EACH_IMPL (impl, 0)
> +        do_one_test (impl, s2, s1, len);
> +    }
>  }
>
>  static void
> @@ -213,56 +215,88 @@ do_random_tests (void)
>  }
>
>  static void
> -do_test1 (size_t size)
> +do_test1 (size_t align1, size_t align2, size_t size)
>  {
>    void *large_buf;
> -  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> -                   MAP_PRIVATE | MAP_ANON, -1, 0);
> +  size_t mmap_size, region_size;
> +
> +  align1 &= (page_size - 1);
> +  if (align1 == 0)
> +    align1 = page_size;
> +
> +  align2 &= (page_size - 1);
> +  if (align2 == 0)
> +    align2 = page_size;
> +
> +  region_size = (size + page_size - 1) & (~(page_size - 1));
> +
> +  mmap_size = region_size * 2 + 3 * page_size;
> +  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> +                   MAP_PRIVATE | MAP_ANON, -1, 0);
>    if (large_buf == MAP_FAILED)
>      {
> -      puts ("Failed to allocat large_buf, skipping do_test1");
> +      puts ("Failed to allocate large_buf, skipping do_test1");
>        return;
>      }
> -
> -  if (mprotect (large_buf + size, page_size, PROT_NONE))
> +  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
>      error (EXIT_FAILURE, errno, "mprotect failed");
>
> -  size_t arrary_size = size / sizeof (uint32_t);
> -  uint32_t *dest = large_buf;
> -  uint32_t *src = large_buf + size + page_size;
> +  size_t array_size = size / sizeof (uint32_t);
> +  uint32_t *dest = large_buf + align1;
> +  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
>    size_t i;
>    size_t repeats;
>    for(repeats = 0; repeats < 2; repeats++)
>      {
> -      for (i = 0; i < arrary_size; i++)
> +      for (i = 0; i < array_size; i++)
>          src[i] = (uint32_t) i;
> -
>        FOR_EACH_IMPL (impl, 0)
>          {
> -            printf ("\t\tRunning: %s\n", impl->name);
> +            //            printf ("\t\tRunning: %s\n", impl->name);
>            memset (dest, -1, size);
>            CALL (impl, (char *) dest, (char *) src, size);
> -          for (i = 0; i < arrary_size; i++)
> +          for (i = 0; i < array_size; i++)
>          if (dest[i] != src[i])
>            {
>              error (0, 0,
>                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
>                 impl->name, dest, src, i);
>              ret = 1;
> -            munmap ((void *) large_buf, size * 2 + page_size);
> +            munmap ((void *) large_buf, mmap_size);
>              return;
>            }
>          }
> -      dest = src;
> -      src = large_buf;
> +      dest = large_buf + region_size + 2 * page_size + align1;
> +      src = large_buf + align2;
> +    }
> +  munmap ((void *) large_buf, mmap_size);
> +}
> +
> +static void
> +do_random_large_tests (void)
> +{
> +  size_t i, align1, align2, size;
> +  for (i = 0; i < 32; ++i)
> +    {
> +      align1 = random ();
> +      align2 = random ();
> +      size = (random() % 0x1000000) + 0x200000;
> +      do_test1 (align1, align2, size);
> +    }
> +
> +  for (i = 0; i < 128; ++i)
> +    {
> +      align1 = random ();
> +      align2 = random ();
> +      size = (random() % 32768) + 4096;
> +      do_test1 (align1, align2, size);
>      }
> -  munmap ((void *) large_buf, size * 2 + page_size);
>  }
>
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  size_t i, j;
>
>    test_init ();
>
> @@ -299,6 +333,7 @@ test_main (void)
>    for (i = 19; i <= 25; ++i)
>      {
>        do_test (255, 0, 1 << i);
> +      do_test (0, 4000, 1 << i);
>        do_test (0, 255, i);
>        do_test (0, 4000, i);
>      }
> @@ -307,8 +342,46 @@ test_main (void)
>
>    do_random_tests ();
>
> -  do_test1 (0x100000);
> -  do_test1 (0x2000000);
> +  do_test1 (0, 0, 0x100000);
> +  do_test1 (0, 0, 0x2000000);
> +
> +  for (i = 4096; i < 32768; i += 4096)
> +    {
> +      for (j = 1; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +        }
> +    }
> +
> +  for (i = 0x300000; i < 0x2000000; i += 0x235689)
> +    {
> +      for (j = 64; j <= 1024; j <<= 1)
> +        {
> +          do_test1 (0, j, i);
> +          do_test1 (4095, j, i);
> +          do_test1 (4096 - j, 0, i);
> +
> +          do_test1 (0, j - 1, i);
> +          do_test1 (4095, j - 1, i);
> +          do_test1 (4096 - j - 1, 0, i);
> +
> +          do_test1 (0, j + 1, i);
> +          do_test1 (4095, j + 1, i);
> +          do_test1 (4096 - j, 1, i);
> +        }
> +    }
> +
> +  do_random_large_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-memmove.c b/string/test-memmove.c
> index 670094c9dc..5ba79acf61 100644
> --- a/string/test-memmove.c
> +++ b/string/test-memmove.c
> @@ -101,11 +101,11 @@ do_test (size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>
> -  align1 &= 63;
> +  align1 &= (getpagesize() - 1);
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= (getpagesize() - 1);
>    if (align2 + len >= page_size)
>      return;
>
> @@ -356,6 +356,51 @@ do_test3 (size_t bytes_move, size_t offset)
>    munmap ((void *) buf, size);
>  }
>
> +static void
> +do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
> +{
> +  size_t size, repeats, i;
> +  uint8_t *buf, *dst, *src;
> +
> +  size = bytes_move + MAX(offset1, offset2);
> +  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
> +             MAP_PRIVATE | MAP_ANON, -1, 0);
> +
> +  if (buf == MAP_FAILED)
> +    error (EXIT_UNSUPPORTED, errno, "mmap failed");
> +
> +  dst = &buf[offset1];
> +  src = &buf[offset2];
> +  for (repeats = 0; repeats < 2; ++repeats)
> +    {
> +      FOR_EACH_IMPL (impl, 0)
> +        {
> +          for (i = 0; i < bytes_move; i++)
> +              src[i] = (uint8_t) i;
> +#ifdef TEST_BCOPY
> +          CALL (impl, (char *) src, (char *) dst, bytes_move);
> +#else
> +          CALL (impl, (char *) dst, (char *) src, bytes_move);
> +#endif
> +          for (i = 0; i < bytes_move; i++)
> +            {
> +              if (dst[i] != (uint8_t) i)
> +                {
> +                  error (0, 0,
> +                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> +                         impl->name, dst, buf, i);
> +                  ret = 1;
> +                  break;
> +                }
> +            }
> +        }
> +      dst = &buf[offset2];
> +      src = &buf[offset1];
> +    }
> +  munmap ((void *) buf, size);
> +}
> +
> +
>  int
>  test_main (void)
>  {
> @@ -396,13 +441,37 @@ test_main (void)
>
>    do_random_tests ();
>
> +  do_test2 (0);
>    do_test2 (33);
>    do_test2 (0x200000);
> +  do_test2 (0x200000 - 1);
> +  do_test2 (0x200000 + 1);
> +  do_test2 (0x1000000 + 1);
>    do_test2 (0x4000000 - 1);
>    do_test2 (0x4000000);
>
> +
>    /* Copy 16KB data.  */
>    do_test3 (16384, 3);
> +  for (i = 4096; i <= 16384; i <<= 1)
> +    {
> +      do_test4 (i, 0, i);
> +      do_test4 (i, 0, i - 1);
> +      do_test4 (i, 0, i + 1);
> +      do_test4 (i, 63, i + 63);
> +      do_test4 (i, 63, i + 64);
> +      do_test4 (i, 63, i);
> +
> +      do_test4 (i, 0, 1);
> +      do_test4 (i, 0, 15);
> +      do_test4 (i, 0, 31);
> +      do_test4 (i, 0, 63);
> +      do_test4 (i, 0, 64);
> +      do_test4 (i, 0, 65);
> +      do_test4 (i, 0, 127);
> +      do_test4 (i, 0, 129);
> +    }
> +
>
>    return ret;
>  }
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c
  2021-08-24  8:27 ` [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
@ 2021-08-24 15:18   ` H.J. Lu via Libc-alpha
  0 siblings, 0 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-08-24 15:18 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Tue, Aug 24, 2021 at 1:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds three new benchmarks for the SPEC2017
> distribution. One randomizes whether dst > src and the other two fix
> it to either 1 or 0.
>
> It also adds some tests for fixed sizes with randomized alignment and
> value of dst > src. This can be useful for testing different alignment
> configurations.
> ---
>  benchtests/bench-memcpy-random.c | 107 +++++++++++++++++++++++++++----
>  1 file changed, 96 insertions(+), 11 deletions(-)
>
> diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c
> index c490b73ed0..28e0acb05f 100644
> --- a/benchtests/bench-memcpy-random.c
> +++ b/benchtests/bench-memcpy-random.c
> @@ -16,7 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#define MIN_PAGE_SIZE (512*1024+getpagesize())
> +#define MAX_TEST_SIZE (512*1024)
> +#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+3*getpagesize())
>  #define TEST_MAIN
>  #define TEST_NAME "memcpy"
>  #include "bench-string.h"
> @@ -89,9 +90,12 @@ static align_data_t dst_align_freq[] =
>
>  typedef struct
>  {
> -  uint64_t src : 24;
> -  uint64_t dst : 24;
> -  uint64_t len : 16;
> +/* 26 bits for src and dst so we have extra bit for alternating dst >
> +   src without a branch.  */
> +  uint64_t src : 26;
> +  uint64_t dst : 26;
> +  /* For size < 4096 12 bits is enough.  */
> +  uint64_t len : 12;
>  } copy_t;
>
>  static copy_t copy[MAX_COPIES];
> @@ -142,34 +146,100 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
>  }
>
>  static void
> -do_test (json_ctx_t *json_ctx, size_t max_size)
> +do_one_fixed_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
> +               copy_t *copy, size_t n, size_t size)
>  {
> -  int i;
> +  timing_t start, stop, cur;
> +  size_t iters = INNER_LOOP_ITERS_SMALL;
>
> -  memset (buf1, 1, max_size);
> +  for (int j = 0; j < n; j++)
> +    CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
>
> -  /* Create a random set of copies with the given size and alignment
> +  TIMING_NOW (start);
> +  for (int i = 0; i < iters; ++i)
> +    for (int j = 0; j < n; j++)
> +      CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +
> +static size_t
> +init_copy(size_t max_size, int dst_gt_src)
> +{
> +  size_t i, dst_offset, src_offset;
> +  if (dst_gt_src <= 0)
> +    {
> +      dst_offset = 0;
> +      src_offset = MAX_TEST_SIZE + getpagesize();
> +    }
> +  else
> +    {
> +      dst_offset = MAX_TEST_SIZE + getpagesize();
> +      src_offset = 0;
> +    }
> +
> +    /* Create a random set of copies with the given size and alignment
>       distributions.  */
>    for (i = 0; i < MAX_COPIES; i++)
>      {
> +      dst_offset  = dst_gt_src == -1
> +                        ? (rand() & 1) ? MAX_TEST_SIZE + getpagesize() : 0
> +                        : dst_offset;
>        copy[i].dst = (rand () & (max_size - 1));
>        copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK];
> +      copy[i].dst += dst_offset;
>        copy[i].src = (rand () & (max_size - 1));
>        copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK];
> +      copy[i].src += src_offset;
>        copy[i].len = size_arr[rand () & SIZE_MASK];
>      }
> +  return i;
> +}
>
> +static void
> +do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
> +{
> +  size_t n;
> +  memset (buf1, 1, max_size);
> +  n = init_copy(max_size, dst_gt_src);
>    json_element_object_begin (json_ctx);
> -  json_attr_uint (json_ctx, "length", (double) max_size);
> +  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
> +  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
> +  json_attr_uint (json_ctx, "with-fixed-size", (double) 0);
>    json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, i);
> +    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n);
>
>    json_array_end (json_ctx);
>    json_element_object_end (json_ctx);
>  }
>
> +static void
> +do_test_fixed_size (json_ctx_t *json_ctx, size_t size, size_t max_size, int dst_gt_src)
> +{
> +  size_t n;
> +  memset (buf1, 1, max_size);
> +  n = init_copy(max_size, dst_gt_src);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
> +  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
> +  json_attr_uint (json_ctx, "with-fixed-size", (double) 1);
> +  json_attr_uint (json_ctx, "size", (double) size);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    do_one_fixed_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n, size);
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +
>  int
>  test_main (void)
>  {
> @@ -194,7 +264,22 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>    for (int i = 4; i <= 512; i = i * 2)
> -    do_test (&json_ctx, i * 1024);
> +    {
> +      if (i * 1024 > MAX_TEST_SIZE)
> +          continue;
> +      do_test (&json_ctx, i * 1024, 0);
> +      do_test (&json_ctx, i * 1024, 1);
> +      do_test (&json_ctx, i * 1024, -1);
> +    }
> +
> +  for (int i = 4; i <= 64; i = i * 2)
> +    {
> +      if (i * 1024 > MAX_TEST_SIZE)
> +          continue;
> +      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 0);
> +      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 1);
> +      do_test_fixed_size (&json_ctx, i * 256, i * 1024, -1);
> +    }
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-08-24  8:27 ` [PATCH 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein via Libc-alpha
@ 2021-08-24 15:18   ` H.J. Lu via Libc-alpha
  0 siblings, 0 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-08-24 15:18 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Tue, Aug 24, 2021 at 1:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds a new partial overlap benchmark. This is generally
> the most interesting performance case for memmove and was missing.
> ---
>  benchtests/bench-memmove-walk.c | 67 ++++++++++++++++++++++++---------
>  1 file changed, 49 insertions(+), 18 deletions(-)
>
> diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
> index b5fdb2a422..18b716f5cb 100644
> --- a/benchtests/bench-memmove-walk.c
> +++ b/benchtests/bench-memmove-walk.c
> @@ -36,6 +36,10 @@
>  # define TIMEOUT (20 * 60)
>  # include "bench-string.h"
>
> +#define NO_OVERLAP 0
> +#define PARTIAL_OVERLAP 1
> +#define COMPLETE_OVERLAP 2
> +
>  IMPL (memmove, 1)
>  #endif
>
> @@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
>  }
>
>  static void
> -do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
> +do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
>  {
> -  json_element_object_begin (json_ctx);
> -  json_attr_uint (json_ctx, "length", (double) len);
> -  json_array_begin (json_ctx, "timings");
> -
> -  if (overlap)
> -    buf2 = buf1;
> -
> -  FOR_EACH_IMPL (impl, 0)
> -    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
> -
> -  json_array_end (json_ctx);
> -  json_element_object_end (json_ctx);
> +  char *s1, *s2, *tmp;
> +  size_t repeats;
> +
> +  s1 = (char *) (buf1);
> +  s2 = (char *) (buf2);
> +  if (overlap != NO_OVERLAP)
> +    s2 = s1;
> +  if (overlap == PARTIAL_OVERLAP)
> +    s2 += len / 2;
> +
> +  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
> +    {
> +      json_element_object_begin (json_ctx);
> +      json_attr_uint (json_ctx, "length", (double) len);
> +      json_attr_string(json_ctx, "overlap",
> +                       overlap == NO_OVERLAP        ? "none"
> +                       : overlap == PARTIAL_OVERLAP ? "partial"
> +                                                    : "complete");
> +      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));
> +      json_array_begin (json_ctx, "timings");
> +
> +
> +      FOR_EACH_IMPL (impl, 0)
> +        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
> +
> +      json_array_end (json_ctx);
> +      json_element_object_end (json_ctx);
> +
> +      tmp = s1;
> +      s1 = s2;
> +      s2 = tmp;
> +    }
>  }
>
>  int
> @@ -107,15 +131,22 @@ test_main (void)
>    /* Non-overlapping buffers.  */
>    for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
>      {
> -      do_test (&json_ctx, i, false);
> -      do_test (&json_ctx, i + 1, false);
> +      do_test (&json_ctx, i, NO_OVERLAP, 1);
> +      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
> +    }
> +
> +  /* Partially-overlapping buffers.  */
> +  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
> +    {
> +      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
> +      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
>      }
>
> -  /* Overlapping buffers.  */
> +  /* Complete-overlapping buffers.  */
>    for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
>      {
> -      do_test (&json_ctx, i, true);
> -      do_test (&json_ctx, i + 1, true);
> +      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
> +      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
>      }
>
>    json_array_end (&json_ctx);
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-08-24  8:27 ` [PATCH 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein via Libc-alpha
@ 2021-08-24 15:19   ` H.J. Lu via Libc-alpha
  0 siblings, 0 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-08-24 15:19 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Tue, Aug 24, 2021 at 1:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds more cases to the common memcpy/memmove
> benchmarks. The most significant cases are the half-page offsets. The
> current version leaves dst and src nearly page aligned, which leads to
> false 4k aliasing on x86_64. This can add noise due to false
> dependencies from one run to the next. As well, this seems like more
> of an edge case than the common case, so it shouldn't be the only
> thing benchmarked.
> ---
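>
> A quick sketch of the aliasing condition described above (not part of
> the benchmark, just for illustration): two accesses can be falsely
> treated as dependent when their addresses share the same offset within
> a 4 KiB page.
>
> #include <stdint.h>
>
> /* Nonzero if dst and src agree in the low 12 bits, the case where
>    x86_64 memory disambiguation can report a false dependency
>    ("4k aliasing").  */
> static int
> may_4k_alias (const void *dst, const void *src)
> {
>   return (((uintptr_t) dst ^ (uintptr_t) src) & 0xfff) == 0;
> }
>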
>  benchtests/bench-memcpy.c  | 42 ++++++++++++++++++++++++++++++++++----
>  benchtests/bench-memmove.c | 21 +++++++++++++++++--
>  2 files changed, 57 insertions(+), 6 deletions(-)
>
> diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
> index d9236a2282..b9e661c997 100644
> --- a/benchtests/bench-memcpy.c
> +++ b/benchtests/bench-memcpy.c
> @@ -60,11 +60,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
>    size_t i, j;
>    char *s1, *s2;
>    size_t repeats;
> -  align1 &= 63;
> +  align1 &= (getpagesize () - 1);
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= (getpagesize () - 1);
>    if (align2 + len >= page_size)
>      return;
>
> @@ -99,7 +99,7 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> -
> +  size_t half_page = getpagesize () / 2;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -121,8 +121,15 @@ test_main (void)
>      {
>        do_test (&json_ctx, 0, 0, 1 << i, 1);
>        do_test (&json_ctx, i, 0, 1 << i, 1);
> +      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
>        do_test (&json_ctx, 0, i, 1 << i, 1);
> +      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
>        do_test (&json_ctx, i, i, 1 << i, 1);
> +      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
> +      do_test (&json_ctx, half_page, 0, 1 << i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
> +      do_test (&json_ctx, half_page, i, 1 << i, 1);
> +      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
>      }
>
>    for (i = 0; i < 32; ++i)
> @@ -131,6 +138,12 @@ test_main (void)
>        do_test (&json_ctx, i, 0, i, 0);
>        do_test (&json_ctx, 0, i, i, 0);
>        do_test (&json_ctx, i, i, i, 0);
> +      do_test (&json_ctx, half_page, 0, i, 0);
> +      do_test (&json_ctx, half_page + i, 0, i, 0);
> +      do_test (&json_ctx, half_page, i, i, 0);
> +      do_test (&json_ctx, half_page + i, i, i, 0);
> +      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
> +      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
>      }
>
>    for (i = 3; i < 32; ++i)
> @@ -141,6 +154,10 @@ test_main (void)
>        do_test (&json_ctx, i, 0, 16 * i, 1);
>        do_test (&json_ctx, 0, i, 16 * i, 1);
>        do_test (&json_ctx, i, i, 16 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 16 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
> +      do_test (&json_ctx, half_page, i, 16 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
>      }
>
>    for (i = 32; i < 64; ++i)
> @@ -149,16 +166,33 @@ test_main (void)
>        do_test (&json_ctx, i, 0, 32 * i, 1);
>        do_test (&json_ctx, 0, i, 32 * i, 1);
>        do_test (&json_ctx, i, i, 32 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 32 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
> +      do_test (&json_ctx, half_page, i, 32 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
>      }
>
>    do_test (&json_ctx, 0, 0, getpagesize (), 1);
>
> -  for (i = 0; i <= 32; ++i)
> +  for (i = 0; i <= 48; ++i)
>      {
>        do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
>        do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
>        do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
>        do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
> +      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
>      }
>
>    json_array_end (&json_ctx);
> diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
> index 6becbf4782..bec1455f7b 100644
> --- a/benchtests/bench-memmove.c
> +++ b/benchtests/bench-memmove.c
> @@ -53,11 +53,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>
> -  align1 &= 63;
> +  align1 &= (getpagesize () - 1);
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= (getpagesize () - 1);
>    if (align2 + len >= page_size)
>      return;
>
> @@ -85,6 +85,7 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> +  size_t half_page = getpagesize () / 2;
>
>    test_init ();
>
> @@ -138,6 +139,22 @@ test_main (void)
>        do_test (&json_ctx, i, i, 32 * i);
>      }
>
> +  for (i = 0; i <= 48; ++i)
> +    {
> +      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, i, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, i, 2048 + 64 * i);
> +      do_test (&json_ctx, i, i, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
> +      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
> +      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
> +      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
> +      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
> +    }
> +
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c
  2021-08-24  8:27 [PATCH 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein via Libc-alpha
                   ` (4 preceding siblings ...)
  2021-08-24 15:17 ` [PATCH 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu via Libc-alpha
@ 2021-08-24 19:32 ` Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
                     ` (3 more replies)
  5 siblings, 4 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24 19:32 UTC (permalink / raw)
  To: libc-alpha

This commit updates the memcpy tests to test both dst > src and dst <
src. This is because there is logic in the code based on the
condition.
---
 string/test-memcpy.c  | 125 +++++++++++++++++++++++++++++++++---------
 string/test-memmove.c |  73 +++++++++++++++++++++++-
 2 files changed, 170 insertions(+), 28 deletions(-)

diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index c9dfc88fed..705d79ba13 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -79,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src,
 static void
 do_test (size_t align1, size_t align2, size_t len)
 {
-  size_t i, j;
+  size_t i, j, repeats;
   char *s1, *s2;
 
   align1 &= 4095;
@@ -92,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len)
 
   s1 = (char *) (buf1 + align1);
   s2 = (char *) (buf2 + align2);
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      for (i = 0, j = 1; i < len; i++, j += 23)
+        s1[i] = j;
 
-  for (i = 0, j = 1; i < len; i++, j += 23)
-    s1[i] = j;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (impl, s2, s1, len);
+    }
 }
 
 static void
@@ -213,56 +215,88 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (size_t size)
+do_test1 (size_t align1, size_t align2, size_t size)
 {
   void *large_buf;
-  large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANON, -1, 0);
+  size_t mmap_size, region_size;
+
+  align1 &= (page_size - 1);
+  if (align1 == 0)
+    align1 = page_size;
+
+  align2 &= (page_size - 1);
+  if (align2 == 0)
+    align2 = page_size;
+
+  region_size = (size + page_size - 1) & (~(page_size - 1));
+
+  mmap_size = region_size * 2 + 3 * page_size;
+  large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
     {
-      puts ("Failed to allocat large_buf, skipping do_test1");
+      puts ("Failed to allocate large_buf, skipping do_test1");
       return;
     }
-
-  if (mprotect (large_buf + size, page_size, PROT_NONE))
+  if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE))
     error (EXIT_FAILURE, errno, "mprotect failed");
 
-  size_t arrary_size = size / sizeof (uint32_t);
-  uint32_t *dest = large_buf;
-  uint32_t *src = large_buf + size + page_size;
+  size_t array_size = size / sizeof (uint32_t);
+  uint32_t *dest = large_buf + align1;
+  uint32_t *src = large_buf + region_size + 2 * page_size + align2;
   size_t i;
   size_t repeats;
   for(repeats = 0; repeats < 2; repeats++)
     {
-      for (i = 0; i < arrary_size; i++)
+      for (i = 0; i < array_size; i++)
         src[i] = (uint32_t) i;
-
       FOR_EACH_IMPL (impl, 0)
         {
-            printf ("\t\tRunning: %s\n", impl->name);
+            //            printf ("\t\tRunning: %s\n", impl->name);
           memset (dest, -1, size);
           CALL (impl, (char *) dest, (char *) src, size);
-          for (i = 0; i < arrary_size; i++)
+          for (i = 0; i < array_size; i++)
         if (dest[i] != src[i])
           {
             error (0, 0,
                "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
                impl->name, dest, src, i);
             ret = 1;
-            munmap ((void *) large_buf, size * 2 + page_size);
+            munmap ((void *) large_buf, mmap_size);
             return;
           }
         }
-      dest = src;
-      src = large_buf;
+      dest = large_buf + region_size + 2 * page_size + align1;
+      src = large_buf + align2;
+    }
+  munmap ((void *) large_buf, mmap_size);
+}
+
+static void
+do_random_large_tests (void)
+{
+  size_t i, align1, align2, size;
+  for (i = 0; i < 32; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 0x1000000) + 0x200000;
+      do_test1 (align1, align2, size);
+    }
+
+  for (i = 0; i < 128; ++i)
+    {
+      align1 = random ();
+      align2 = random ();
+      size = (random() % 32768) + 4096;
+      do_test1 (align1, align2, size);
     }
-  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j;
 
   test_init ();
 
@@ -299,6 +333,7 @@ test_main (void)
   for (i = 19; i <= 25; ++i)
     {
       do_test (255, 0, 1 << i);
+      do_test (0, 4000, 1 << i);
       do_test (0, 255, i);
       do_test (0, 4000, i);
     }
@@ -307,8 +342,46 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test1 (0x100000);
-  do_test1 (0x2000000);
+  do_test1 (0, 0, 0x100000);
+  do_test1 (0, 0, 0x2000000);
+
+  for (i = 4096; i < 32768; i += 4096)
+    {
+      for (j = 1; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  for (i = 0x300000; i < 0x2000000; i += 0x235689)
+    {
+      for (j = 64; j <= 1024; j <<= 1)
+        {
+          do_test1 (0, j, i);
+          do_test1 (4095, j, i);
+          do_test1 (4096 - j, 0, i);
+
+          do_test1 (0, j - 1, i);
+          do_test1 (4095, j - 1, i);
+          do_test1 (4096 - j - 1, 0, i);
+
+          do_test1 (0, j + 1, i);
+          do_test1 (4095, j + 1, i);
+          do_test1 (4096 - j, 1, i);
+        }
+    }
+
+  do_random_large_tests ();
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index 670094c9dc..5ba79acf61 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -101,11 +101,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize() - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize() - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -356,6 +356,51 @@ do_test3 (size_t bytes_move, size_t offset)
   munmap ((void *) buf, size);
 }
 
+static void
+do_test4 (size_t bytes_move, size_t offset1, size_t offset2)
+{
+  size_t size, repeats, i;
+  uint8_t *buf, *dst, *src;
+
+  size = bytes_move + MAX(offset1, offset2);
+  buf  = mmap(NULL, size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANON, -1, 0);
+
+  if (buf == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+  dst = &buf[offset1];
+  src = &buf[offset2];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < bytes_move; i++)
+              src[i] = (uint8_t) i;
+#ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+#else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+#endif
+          for (i = 0; i < bytes_move; i++)
+            {
+              if (dst[i] != (uint8_t) i)
+                {
+                  error (0, 0,
+                         "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                         impl->name, dst, src, i);
+                  ret = 1;
+                  break;
+                }
+            }
+        }
+      dst = &buf[offset2];
+      src = &buf[offset1];
+    }
+  munmap ((void *) buf, size);
+}
+
+
 int
 test_main (void)
 {
@@ -396,13 +441,37 @@ test_main (void)
 
   do_random_tests ();
 
+  do_test2 (0);
   do_test2 (33);
   do_test2 (0x200000);
+  do_test2 (0x200000 - 1);
+  do_test2 (0x200000 + 1);
+  do_test2 (0x1000000 + 1);
   do_test2 (0x4000000 - 1);
   do_test2 (0x4000000);
 
+
   /* Copy 16KB data.  */
   do_test3 (16384, 3);
+  for (i = 4096; i <= 16384; i <<= 1)
+    {
+      do_test4 (i, 0, i);
+      do_test4 (i, 0, i - 1);
+      do_test4 (i, 0, i + 1);      
+      do_test4 (i, 63, i + 63);
+      do_test4 (i, 63, i + 64);
+      do_test4 (i, 63, i);
+
+      do_test4 (i, 0, 1);
+      do_test4 (i, 0, 15);
+      do_test4 (i, 0, 31);
+      do_test4 (i, 0, 63);
+      do_test4 (i, 0, 64);
+      do_test4 (i, 0, 65);
+      do_test4 (i, 0, 127);
+      do_test4 (i, 0, 129);
+    }
+
 
   return ret;
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v1 2/5] benchtests: Add new random cases to bench-memcpy-random.c
  2021-08-24 19:32 ` [PATCH v1 " Noah Goldstein via Libc-alpha
@ 2021-08-24 19:32   ` Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein via Libc-alpha
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24 19:32 UTC (permalink / raw)
  To: libc-alpha

This commit adds three new benchmarks based on the SPEC2017
distribution. One randomizes whether dst > src and the other two fix
it to 1 or 0 respectively.

It also adds tests for fixed sizes with randomized alignment and
randomized dst > src. This can be useful for testing different
alignment configurations.
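
As a sketch of the mechanism (the patch's init_copy differs in detail;
the names here are illustrative), a dst_gt_src flag of 1, 0 or -1 can
be turned into per-copy offsets inside a single 3 * max_size region:

/* Hedged sketch, not the benchmark code itself.  */
#include <stdlib.h>

static void
pick_offsets (size_t max_size, int dst_gt_src,
              size_t *dst_off, size_t *src_off)
{
  /* -1 means: decide randomly for each copy.  */
  int dst_high = (dst_gt_src < 0) ? (rand () & 1) : dst_gt_src;
  *dst_off = dst_high ? max_size : 0;
  *src_off = dst_high ? 0 : max_size;
}
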
---
 benchtests/bench-memcpy-random.c | 103 +++++++++++++++++++++++++++----
 1 file changed, 92 insertions(+), 11 deletions(-)

diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c
index c490b73ed0..eeeef42fc1 100644
--- a/benchtests/bench-memcpy-random.c
+++ b/benchtests/bench-memcpy-random.c
@@ -16,7 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define MIN_PAGE_SIZE (512*1024+getpagesize())
+#define MAX_TEST_SIZE (512*1024)
+#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+getpagesize())
 #define TEST_MAIN
 #define TEST_NAME "memcpy"
 #include "bench-string.h"
@@ -89,9 +90,12 @@ static align_data_t dst_align_freq[] =
 
 typedef struct
 {
-  uint64_t src : 24;
-  uint64_t dst : 24;
-  uint64_t len : 16;
+/* 26 bits for src and dst so we have extra bit for alternating dst >
+   src without a branch.  */
+  uint64_t src : 26;
+  uint64_t dst : 26;
+/* For size < 4096 12 bits is enough.  */
+  uint64_t len : 12;
 } copy_t;
 
 static copy_t copy[MAX_COPIES];
@@ -142,34 +146,100 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t max_size)
+do_one_fixed_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
+               copy_t *copy, size_t n, size_t size)
 {
-  int i;
+  timing_t start, stop, cur;
+  size_t iters = INNER_LOOP_ITERS_SMALL;
+
+  for (int j = 0; j < n; j++)
+    CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
+
+  TIMING_NOW (start);
+  for (int i = 0; i < iters; ++i)
+    for (int j = 0; j < n; j++)
+      CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+
+static size_t
+init_copy(size_t max_size, int dst_gt_src)
+{
+  size_t i, dst_offset, src_offset;
+  if (dst_gt_src <= 0)
+    {
+      dst_offset = 0;
+      src_offset = max_size;
+    }
+  else
+    {
+      dst_offset = max_size;
+      src_offset = 0;
+    }
 
-  memset (buf1, 1, max_size);
 
   /* Create a random set of copies with the given size and alignment
      distributions.  */
   for (i = 0; i < MAX_COPIES; i++)
     {
+      dst_offset  = dst_gt_src == -1
+                        ? (rand() & 1) ? max_size : 0
+                        : dst_offset;
       copy[i].dst = (rand () & (max_size - 1));
       copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK];
+      copy[i].dst += dst_offset;
       copy[i].src = (rand () & (max_size - 1));
       copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK];
+      copy[i].src += src_offset;
       copy[i].len = size_arr[rand () & SIZE_MASK];
     }
+  memset (buf1, 1, 3 * max_size);
+  return i;
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  n = init_copy(max_size, dst_gt_src);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "region-size", (double) 3 * max_size);
+  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
+  json_attr_uint (json_ctx, "with-fixed-size", (double) 0);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    do_one_test (json_ctx, impl, (char *) buf1, (char *) buf1, copy, n);
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
 
+static void
+do_test_fixed_size (json_ctx_t *json_ctx, size_t size, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  n = init_copy(3 * max_size, dst_gt_src);
   json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) max_size);
+  json_attr_uint (json_ctx, "region-size", (double) 3 * max_size);
+  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
+  json_attr_uint (json_ctx, "with-fixed-size", (double) 1);
+  json_attr_uint (json_ctx, "size", (double) size);
   json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, i);
+    do_one_fixed_test (json_ctx, impl, (char *) buf1, (char *) buf1, copy, n, size);
 
   json_array_end (json_ctx);
   json_element_object_end (json_ctx);
 }
 
+
 int
 test_main (void)
 {
@@ -193,8 +263,19 @@ test_main (void)
   json_array_end (&json_ctx);
 
   json_array_begin (&json_ctx, "results");
-  for (int i = 4; i <= 512; i = i * 2)
-    do_test (&json_ctx, i * 1024);
+  for (int i = 4096; i < MAX_TEST_SIZE; i = i * 2)
+    {
+      do_test (&json_ctx, i, 0);
+      do_test (&json_ctx, i, 1);
+      do_test (&json_ctx, i, -1);
+    }
+
+  for (int i = 4096; i <= 65536; i = i * 2)
+    {
+      do_test_fixed_size (&json_ctx, i, i, 0);
+      do_test_fixed_size (&json_ctx, i, i, 1);
+      do_test_fixed_size (&json_ctx, i, i, -1);
+    }
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c
  2021-08-24 19:32 ` [PATCH v1 " Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
@ 2021-08-24 19:32   ` Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Noah Goldstein via Libc-alpha
  3 siblings, 0 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24 19:32 UTC (permalink / raw)
  To: libc-alpha

This commit adds a new partial overlap benchmark. This is generally
the most interesting performance case for memmove and was missing.
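
A minimal example of the partially overlapping case (illustrative
only; it assumes a buffer of at least len + len / 2 bytes):

#include <string.h>

static void
partial_overlap_move (char *buf, size_t len)
{
  /* dst = src + len / 2: the regions overlap by half, and because
     dst > src the copy has to run backwards.  */
  memmove (buf + len / 2, buf, len);
}

Swapping dst and src gives the forward direction of the same overlap.
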
---
 benchtests/bench-memmove-walk.c | 67 ++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c
index b5fdb2a422..18b716f5cb 100644
--- a/benchtests/bench-memmove-walk.c
+++ b/benchtests/bench-memmove-walk.c
@@ -36,6 +36,10 @@
 # define TIMEOUT (20 * 60)
 # include "bench-string.h"
 
+#define NO_OVERLAP 0
+#define PARTIAL_OVERLAP 1
+#define COMPLETE_OVERLAP 2
+
 IMPL (memmove, 1)
 #endif
 
@@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t len, bool overlap)
+do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways)
 {
-  json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) len);
-  json_array_begin (json_ctx, "timings");
-
-  if (overlap)
-    buf2 = buf1;
-
-  FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
-
-  json_array_end (json_ctx);
-  json_element_object_end (json_ctx);
+  char *s1, *s2, *tmp;    
+  size_t repeats;
+
+  s1 = (char *) (buf1);
+  s2 = (char *) (buf2);
+  if (overlap != NO_OVERLAP)
+    s2 = s1;
+  if (overlap == PARTIAL_OVERLAP)
+    s2 += len / 2;
+
+  for (repeats = both_ways ? 2 : 1; repeats; --repeats)
+    {    
+      json_element_object_begin (json_ctx);
+      json_attr_uint (json_ctx, "length", (double) len);
+      json_attr_string(json_ctx, "overlap",
+                       overlap == NO_OVERLAP        ? "none"
+                       : overlap == PARTIAL_OVERLAP ? "partial"
+                                                    : "complete");
+      json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1));      
+      json_array_begin (json_ctx, "timings");
+
+
+      FOR_EACH_IMPL (impl, 0)
+        do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
+
+      json_array_end (json_ctx);
+      json_element_object_end (json_ctx);
+
+      tmp = s1;
+      s1 = s2;
+      s2 = tmp;
+    }
 }
 
 int
@@ -107,15 +131,22 @@ test_main (void)
   /* Non-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, false);
-      do_test (&json_ctx, i + 1, false);
+      do_test (&json_ctx, i, NO_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, NO_OVERLAP, 1);
+    }
+
+  /* Partially-overlapping buffers.  */
+  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1)
+    {
+      do_test (&json_ctx, i, PARTIAL_OVERLAP, 1);
+      do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1);
     }
 
-  /* Overlapping buffers.  */
+  /* Complete-overlapping buffers.  */
   for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
     {
-      do_test (&json_ctx, i, true);
-      do_test (&json_ctx, i + 1, true);
+      do_test (&json_ctx, i, COMPLETE_OVERLAP, 0);
+      do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0);
     }
 
   json_array_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v1 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c
  2021-08-24 19:32 ` [PATCH v1 " Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein via Libc-alpha
@ 2021-08-24 19:32   ` Noah Goldstein via Libc-alpha
  2021-08-24 19:32   ` [PATCH v1 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Noah Goldstein via Libc-alpha
  3 siblings, 0 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24 19:32 UTC (permalink / raw)
  To: libc-alpha

This commit adds more cases to the common memcpy/memmove
benchmarks. The most significant additions are the half-page
offsets. The current versions leave dst and src nearly page aligned,
which leads to false 4k aliasing on x86_64. This can add noise due to
false dependencies from one run to the next. As well, page alignment
is more of an edge case than the common case, so it should not be the
only configuration benchmarked.
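
Roughly speaking, two addresses "4k alias" when they are a multiple of
the page size apart, so loads from one can be falsely flagged as
depending on earlier stores to the other. A hedged helper (assuming
4 KiB pages) that captures the condition:

#include <stdint.h>

static int
may_false_alias_4k (const void *dst, const void *src)
{
  /* True when dst and src share their low 12 address bits.  */
  return (((uintptr_t) dst - (uintptr_t) src) & 0xfff) == 0;
}

Offsetting one side by half a page keeps this condition false for the
whole copy.
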
---
 benchtests/bench-memcpy.c  | 42 ++++++++++++++++++++++++++++++++++----
 benchtests/bench-memmove.c | 21 +++++++++++++++++--
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index d9236a2282..b9e661c997 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -60,11 +60,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   size_t i, j;
   char *s1, *s2;
   size_t repeats;
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -99,7 +99,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
+  size_t half_page = getpagesize () / 2;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -121,8 +121,15 @@ test_main (void)
     {
       do_test (&json_ctx, 0, 0, 1 << i, 1);
       do_test (&json_ctx, i, 0, 1 << i, 1);
+      do_test (&json_ctx, i + 32, 0, 1 << i, 1);
       do_test (&json_ctx, 0, i, 1 << i, 1);
+      do_test (&json_ctx, 0, i + 32, 1 << i, 1);
       do_test (&json_ctx, i, i, 1 << i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 1 << i, 1);
+      do_test (&json_ctx, half_page, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, 0, 1 << i, 1);
+      do_test (&json_ctx, half_page, i, 1 << i, 1);
+      do_test (&json_ctx, half_page + i, i, 1 << i, 1);
     }
 
   for (i = 0; i < 32; ++i)
@@ -131,6 +138,12 @@ test_main (void)
       do_test (&json_ctx, i, 0, i, 0);
       do_test (&json_ctx, 0, i, i, 0);
       do_test (&json_ctx, i, i, i, 0);
+      do_test (&json_ctx, half_page, 0, i, 0);
+      do_test (&json_ctx, half_page + i, 0, i, 0);
+      do_test (&json_ctx, half_page, i, i, 0);
+      do_test (&json_ctx, half_page + i, i, i, 0);
+      do_test (&json_ctx, getpagesize () - 1, 0, i, 0);
+      do_test (&json_ctx, 0, getpagesize () - 1, i, 0);
     }
 
   for (i = 3; i < 32; ++i)
@@ -141,6 +154,10 @@ test_main (void)
       do_test (&json_ctx, i, 0, 16 * i, 1);
       do_test (&json_ctx, 0, i, 16 * i, 1);
       do_test (&json_ctx, i, i, 16 * i, 1);
+      do_test (&json_ctx, half_page, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 16 * i, 1);
+      do_test (&json_ctx, half_page, i, 16 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 16 * i, 1);
     }
 
   for (i = 32; i < 64; ++i)
@@ -149,16 +166,33 @@ test_main (void)
       do_test (&json_ctx, i, 0, 32 * i, 1);
       do_test (&json_ctx, 0, i, 32 * i, 1);
       do_test (&json_ctx, i, i, 32 * i, 1);
+      do_test (&json_ctx, half_page, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 32 * i, 1);
+      do_test (&json_ctx, half_page, i, 32 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 32 * i, 1);
     }
 
   do_test (&json_ctx, 0, 0, getpagesize (), 1);
 
-  for (i = 0; i <= 32; ++i)
+  for (i = 0; i <= 48; ++i)
     {
       do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1);
       do_test (&json_ctx, 0, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1);
       do_test (&json_ctx, i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i, 2048 + 64 * i, 1);
+      do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1);
+      do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1);
     }
 
   json_array_end (&json_ctx);
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index 6becbf4782..bec1455f7b 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -53,11 +53,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= (getpagesize () - 1);
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= (getpagesize () - 1);
   if (align2 + len >= page_size)
     return;
 
@@ -85,6 +85,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
+  size_t half_page = getpagesize () / 2;
 
   test_init ();
 
@@ -138,6 +139,22 @@ test_main (void)
       do_test (&json_ctx, i, i, 32 * i);
     }
 
+  for (i = 0; i <= 48; ++i)
+    {
+      do_test (&json_ctx, 0, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, 0, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page, 2048 + 64 * i);
+      do_test (&json_ctx, half_page, i, 2048 + 64 * i);
+      do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i);
+      do_test (&json_ctx, half_page + i, i, 2048 + 64 * i);
+      do_test (&json_ctx, i, half_page + i, 2048 + 64 * i);
+    }
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v1 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S
  2021-08-24 19:32 ` [PATCH v1 " Noah Goldstein via Libc-alpha
                     ` (2 preceding siblings ...)
  2021-08-24 19:32   ` [PATCH v1 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein via Libc-alpha
@ 2021-08-24 19:32   ` Noah Goldstein via Libc-alpha
  3 siblings, 0 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24 19:32 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memmove-vec-unaligned-erms.S.

The optimizations are, in descending order of importance, to
L(less_vec), L(movsb), the 8x forward/backward loops, and various
target alignments that have minimal code size impact.

The L(less_vec) optimizations are to:

    1. Readjust the branch order to either give hotter paths a fall
    through case or put fewer branches in their way.
    2. Moderately change the size classes to make hot branches hotter
    and thus increase predictability (see the sketch after this list).
    3. Try to minimize branch aliasing to avoid misses caused by BPU
    thrashing.
    4. 64-byte align the function entry preceding L(less_vec). This is
    to avoid cases where seemingly unrelated changes end up having
    severe negative performance impacts.
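
A minimal C sketch of the size-class idea referenced in item 2 (the
boundaries and names here are illustrative, not the asm's, and it
assumes n < 16, i.e. below one 16-byte vector): each class covers an
inclusive range with one load from the start and one from the end of
the copy, so there is no per-byte loop and the hottest class gets the
fall-through path.

#include <stdint.h>
#include <string.h>

/* Constant-size memcpy is only used as a portable unaligned
   load/store here.  */
static void
copy_below_one_vec (unsigned char *dst, const unsigned char *src,
                    size_t n)
{
  if (n >= 8)              /* [8, 15]: hottest class falls through.  */
    {
      uint64_t a, b;
      memcpy (&a, src, 8);
      memcpy (&b, src + n - 8, 8);   /* Overlaps the first load.  */
      memcpy (dst, &a, 8);
      memcpy (dst + n - 8, &b, 8);
    }
  else if (n >= 4)         /* [4, 7].  */
    {
      uint32_t a, b;
      memcpy (&a, src, 4);
      memcpy (&b, src + n - 4, 4);
      memcpy (dst, &a, 4);
      memcpy (dst + n - 4, &b, 4);
    }
  else if (n >= 2)         /* [2, 3].  */
    {
      uint16_t a;
      unsigned char b = src[n - 1];
      memcpy (&a, src, 2);
      memcpy (dst, &a, 2);
      dst[n - 1] = b;
    }
  else if (n == 1)
    *dst = *src;
}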

The L(movsb) optimizations are to:

    1. Reduce the number of taken branches needed to determine if
    movsb should be used.
    2. 64-byte align dst if the CPU has FSRM or if dst and src do not
    4k alias.
    3. 64-byte align src if the CPU does not have FSRM and dst and src
    do 4k alias (see the sketch after this list).
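
The alias test itself is cheap; in C terms (assuming PAGE_SIZE is
4096, mirroring the testl $(PAGE_SIZE - 512) check in the patch) it
amounts to:

#include <stdint.h>

static int
dst_src_4k_alias (uintptr_t dst, uintptr_t src)
{
  /* Bits 9-11 of the distance being non-zero means
     (dst - src) mod 4096 >= 512, i.e. no 4k aliasing.  */
  return ((dst - src) & (4096 - 512)) == 0;
}

With FSRM, or when this returns 0, the destination is aligned to 64
bytes; otherwise the source is.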

The 8x forward/backward loop optimizations are to:

    1. Reduce instructions needed for aligning to VEC_SIZE (see the
    sketch after this list).
    2. Reduce uops and code size of the loops.
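
For item 1, the alignment sequence can be expressed in C-like terms as
follows (a sketch under the assumption that VEC_SIZE is a power of
two; the first VEC has already been copied unaligned, so always
advancing dst past the next boundary is fine):

#include <stdint.h>

static void
align_dst_for_loop (uintptr_t *dst, uintptr_t *src, uintptr_t vec_size)
{
  uintptr_t delta = *src - *dst;        /* Keep src as an offset.  */
  *dst = (*dst | (vec_size - 1)) + 1;   /* Next vec_size boundary.  */
  *src = *dst + delta;                  /* Re-derive src from dst.  */
}

This roughly replaces the old mask-and-adjust sequence on dst, src and
len with a subtract, an or and an increment.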

All tests in string/ passing.
---
 sysdeps/x86/sysdep.h                          |  13 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 484 +++++++++++-------
 2 files changed, 317 insertions(+), 180 deletions(-)

diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index cac1d762fb..9226d2c6c9 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,18 @@ enum cf_protection_level
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
 
 /* Define an entry point visible from C.  */
-#define	ENTRY(name)							      \
-  .globl C_SYMBOL_NAME(name);						      \
-  .type C_SYMBOL_NAME(name),@function;					      \
-  .align ALIGNARG(4);							      \
+#define	P2ALIGN_ENTRY(name, alignment)							      \
+  .globl C_SYMBOL_NAME(name);							      \
+  .type C_SYMBOL_NAME(name),@function;							      \
+  .align ALIGNARG(alignment);							      \
   C_LABEL(name)								      \
   cfi_startproc;							      \
-  _CET_ENDBR;								      \
+  _CET_ENDBR;							      \
   CALL_MCOUNT
 
+#define	ENTRY(name) P2ALIGN_ENTRY(name, 4)
+
+
 #undef	END
 #define END(name)							      \
   cfi_endproc;								      \
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 9f02624375..75b6efe969 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -165,6 +165,32 @@
 # error Invalid LARGE_LOAD_SIZE
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64-byte alignment
+   and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE	>	16)
+
+/* Number of VECs to align movsb to.  */
+#if VEC_SIZE == 64
+# define MOVSB_ALIGN_TO	(VEC_SIZE)
+#else
+# define MOVSB_ALIGN_TO	(VEC_SIZE	*	2)
+#endif
+
+/* Macro for copying inclusive power of 2 range with two register
+   loads.  */
+#define COPY_BLOCK(mov_inst, src_reg, dst_reg, size_reg, len, tmp_reg0, tmp_reg1)	\
+	mov_inst (%src_reg), %tmp_reg0; \
+	mov_inst -(len)(%src_reg, %size_reg), %tmp_reg1; \
+	mov_inst %tmp_reg0, (%dst_reg); \
+	mov_inst %tmp_reg1, -(len)(%dst_reg, %size_reg);
+
+/* Define all copies used by L(less_vec) for VEC_SIZE of 16, 32, or
+   64.  */
+#define COPY_4_8	COPY_BLOCK(movl, rsi, rdi, rdx, 4, ecx, esi)
+#define COPY_8_16	COPY_BLOCK(movq, rsi, rdi, rdx, 8, rcx, rsi)
+#define COPY_16_32	COPY_BLOCK(vmovdqu, rsi, rdi, rdx, 16, xmm0, xmm1)
+#define COPY_32_64	COPY_BLOCK(vmovdqu64, rsi, rdi, rdx, 32, ymm16, ymm17)
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -198,7 +224,13 @@ L(start):
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
+	/* Based on SPEC2017 distribution both 16 and 32 memcpy calls are
+	   really hot so we want them to take the same branch path.  */
+#if VEC_SIZE > 16
+	jbe	L(less_vec)
+#else
 	jb	L(less_vec)
+#endif
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 #if !defined USE_MULTIARCH || !IS_IN (libc)
@@ -206,15 +238,10 @@ L(last_2x_vec):
 #endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
-#else
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
-#endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
@@ -289,7 +316,9 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+/* Cache align entry so that branch heavy L(less_vec) maintains good
+   alignment.  */
+P2ALIGN_ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -297,123 +326,217 @@ L(start_erms):
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
+	/* Based on SPEC2017 distribution both 16 and 32 memcpy calls are
+	   really hot so we want them to take the same branch path.  */
+# if VEC_SIZE > 16
+	jbe	L(less_vec)
+# else
 	jb	L(less_vec)
+# endif
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
+#if VEC_SIZE == 64
+L(copy_8_15):
+	COPY_8_16
+	ret
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+L(copy_33_63):
+	COPY_32_64
 	ret
 #endif
-
+	/* Only worth aligning if near end of 16 byte block and won't get
+	   first branch in first decode after jump.  */
+	.p2align 4,, 6
 L(less_vec):
-	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
-#if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
+	/* Second set of branches for smallest copies.  */
+	cmpl	$(VEC_SIZE / 4), %edx
+	jb	L(less_quarter_vec)
+
+	cmpl	$(VEC_SIZE / 2), %edx
+#if VEC_SIZE == 64
+	/* We branch to [33, 63] instead of [16, 32] to give [16, 32] fall
+	   through path as [16, 32] is hotter.  */
+	ja	L(copy_33_63)
+	COPY_16_32
+#elif VEC_SIZE == 32
+	/* Branch to [8, 15]. Fall through to [16, 32].  */
+	jb	L(copy_8_15)
+	COPY_16_32
+#else
+	/* Branch to [4, 7]. Fall through to [8, 15].  */
+	jb	L(copy_4_7)
+	COPY_8_16
 #endif
-#if VEC_SIZE > 16
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-#endif
-	cmpb	$8, %dl
-	jae	L(between_8_15)
-	cmpb	$4, %dl
-	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
+	ret
+	/* Align if won't cost too many bytes.  */
+	.p2align 4,, 6
+L(copy_4_7):
+	COPY_4_8
+	ret
+
+	/* Cold target. No need to align.  */
+L(copy_1):
 	movzbl	(%rsi), %ecx
 	movb	%cl, (%rdi)
-1:
 	ret
+
+	/* Colder copy case for [0, VEC_SIZE / 4 - 1].  */
+L(less_quarter_vec):
 #if VEC_SIZE > 32
-L(between_32_63):
-	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
 #endif
 #if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-L(between_8_15):
-	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
-	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rsi, (%rdi)
-	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+#endif
+	cmpl	$1, %edx
+	je	L(copy_1)
+	jb	L(copy_0)
+	/* Fall through into copy [2, 3] as it is more common than [0, 1].
+	 */
+	movzwl	(%rsi), %ecx
+	movzbl	-1(%rsi, %rdx), %esi
+	movw	%cx, (%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+L(copy_0):
 	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
+
+	.p2align 4
+#if VEC_SIZE == 32
+L(copy_8_15):
+	COPY_8_16
 	ret
+	/* COPY_8_16 is exactly 17 bytes so don't want to p2align after as
+	   it wastes 15 bytes of code and 1 byte off is fine.  */
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward movsb is slow.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+	/* If above __x86_rep_movsb_stop_threshold most likely is candidate
+	   for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if ALIGN_MOVSB
+	VMOVU	(%rsi), %VEC(0)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Store dst for use after rep movsb.  */
+	movq	%rdi, %r8
+# endif
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy conditions
+	   means only case of slow movsb with src = dst + [0, 63] is ecx in
+	   [-63, 0]. Use unsigned comparison with -64 check for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+# endif
+# if ALIGN_MOVSB
+	/* Fall through means cpu has FSRM. In that case exclusively align
+	   destination.  */
+
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%rdi, %rdx), %rcx
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.  */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
 
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Add dst to len. Subtract back after dst aligned. -1 because dst
+	   is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-(1)(%rdi, %rdx), %rcx
+	/* Inclusively align dst to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rdi
+	leaq	1(%rdi, %rsi), %rsi
+	/* Restore src and len adjusted with new values for aligned dst.  */
+	subq	%rdi, %rcx
+	/* Finish aligning dst.  */
+	incq	%rdi
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+
+L(skip_short_movsb_check):
+	/* If CPU does not have FSRM two options for aligning. Align src if
+	   dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because src
+	   is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-(1)(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned dst.  */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else
+	/* Not aligning before rep movsb, so just copy.  */
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
+	/* Align if doesn't cost too many bytes.  */
+	.p2align 4,, 6
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
@@ -426,50 +549,60 @@ L(more_2x_vec):
 	ja	L(more_8x_vec)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
+	/* Align if doesn't cost too much code size. 6 bytes so that after
+	   jump to target a full mov instruction will always be able to be
+	   fetched.  */
+	.p2align 4,, 6
 L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	/* Keep nop target close to jmp for 2-byte encoding.  */
+L(nop):
 	VZEROUPPER_RETURN
-
+	/* Align if doesn't cost too much code size.  */
+	.p2align 4,, 10
 L(more_8x_vec):
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* Entry if rdx is greater than non-temporal threshold but there is
+	   overlap.  */
 L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
 	je	L(nop)
+	/* Entry if rdx is greater than movsb or stop movsb threshold but
+	   there is overlap with dst > src.  */
+L(more_8x_vec_forward):
 	/* Load the first VEC and last 4 * VEC to support overlapping
 	   addresses.  */
 	VMOVU	(%rsi), %VEC(4)
@@ -477,22 +610,18 @@ L(more_8x_vec_check):
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
-
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	leaq	1(%rdi, %rsi), %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
 	.p2align 4
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
@@ -501,23 +630,27 @@ L(loop_4x_vec_forward):
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%rdi)
 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(4), (%rcx)
+	/* Keep nop target close to jmp for 2-byte encoding.  */
+L(nop2):
 	VZEROUPPER_RETURN
-
+	/* Entry from fail movsb. Need to test if dst - src == 0 still.  */
+L(more_8x_vec_backward_check_nop):
+	testq	%rcx, %rcx
+	jz	L(nop2)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
@@ -525,49 +658,50 @@ L(more_8x_vec_backward):
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Save beginning of buffer.  */
+	movq	%rdi, %rcx
+	/* Set dst to beginning of region to copy. -1 for inclusive
+	   alignment.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rdi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rdi
+	/* Restore src.  */
+	addq	%rdi, %rsi
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(0)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(3)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 0)(%rdi)
+	addq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rdi, %rcx
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), (%rcx)
+	VMOVU	%VEC(5), VEC_SIZE(%rcx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rcx)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rcx)
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	.p2align 4
+	/* Entry if rdx > stop movsb threshold (usually set to non-temporal
+	   threshold).  */
+L(large_memcpy_2x_check):
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_forward)
 L(large_memcpy_2x):
 	/* Compute absolute value of difference between source and
 	   destination.  */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2021-08-24 19:36 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-24  8:27 [PATCH 1/5] string: Make tests birdirectional test-memcpy.c Noah Goldstein via Libc-alpha
2021-08-24  8:27 ` [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
2021-08-24 15:18   ` H.J. Lu via Libc-alpha
2021-08-24  8:27 ` [PATCH 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein via Libc-alpha
2021-08-24 15:18   ` H.J. Lu via Libc-alpha
2021-08-24  8:27 ` [PATCH 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein via Libc-alpha
2021-08-24 15:19   ` H.J. Lu via Libc-alpha
2021-08-24  8:27 ` [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Noah Goldstein via Libc-alpha
2021-08-24  9:12   ` Noah Goldstein via Libc-alpha
2021-08-24 15:17 ` [PATCH 1/5] string: Make tests birdirectional test-memcpy.c H.J. Lu via Libc-alpha
2021-08-24 19:32 ` [PATCH v1 " Noah Goldstein via Libc-alpha
2021-08-24 19:32   ` [PATCH v1 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
2021-08-24 19:32   ` [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Noah Goldstein via Libc-alpha
2021-08-24 19:32   ` [PATCH v1 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Noah Goldstein via Libc-alpha
2021-08-24 19:32   ` [PATCH v1 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Noah Goldstein via Libc-alpha
