[PATCH] Add malloc micro benchmark

unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed

* [PATCH] Add malloc micro benchmark
@ 2017-12-01 13:51 Wilco Dijkstra
  2017-12-01 16:13 ` Carlos O'Donell
  0 siblings, 1 reply; 15+ messages in thread
From: Wilco Dijkstra @ 2017-12-01 13:51 UTC (permalink / raw)
  To: libc-alpha@sourceware.org; +Cc: nd

Add a malloc micro benchmark to enable accurate testing of the
various paths in malloc and free.  The benchmark does a varying
number of allocations of a given block size, then frees them again.
It does so for several block sizes and numbers of allocated blocks.
Although the test is single-threaded, it also tests what happens
when you disable single-threaded fast paths (i.e. SINGLE_THREAD_P
is false).

OK for commit?

Typical output on an x64 box:
{
 "timing_type": "hp_timing",
 "functions": {
  "malloc": {
   "malloc_block_size_0016": {
    "st_num_allocs_0025_time": 53.5486,
    "st_num_allocs_0100_time": 57.2553,
    "st_num_allocs_0400_time": 57.3204,
    "st_num_allocs_1000_time": 57.2059,
    "mt_num_allocs_0025_time": 87.7903,
    "mt_num_allocs_0100_time": 100.772,
    "mt_num_allocs_0400_time": 103.827,
    "mt_num_allocs_1000_time": 104.812
   },
   "malloc_block_size_0256": {
    "st_num_allocs_0025_time": 78.3056,
    "st_num_allocs_0100_time": 85.6392,
    "st_num_allocs_0400_time": 91.5187,
    "st_num_allocs_1000_time": 163.458,
    "mt_num_allocs_0025_time": 115.925,
    "mt_num_allocs_0100_time": 140.735,
    "mt_num_allocs_0400_time": 152.044,
    "mt_num_allocs_1000_time": 225.118
   },
   "malloc_block_size_1024": {
    "st_num_allocs_0025_time": 113.705,
    "st_num_allocs_0100_time": 103.79,
    "st_num_allocs_0400_time": 479.029,
    "st_num_allocs_1000_time": 634.228,
    "mt_num_allocs_0025_time": 145.807,
    "mt_num_allocs_0100_time": 151.157,
    "mt_num_allocs_0400_time": 526.499,
    "mt_num_allocs_1000_time": 687.357
   },
   "malloc_block_size_4096": {
    "st_num_allocs_0025_time": 105.101,
    "st_num_allocs_0100_time": 1640.23,
    "st_num_allocs_0400_time": 2411.26,
    "st_num_allocs_1000_time": 2641.56,
    "mt_num_allocs_0025_time": 156.323,
    "mt_num_allocs_0100_time": 1702.94,
    "mt_num_allocs_0400_time": 2453,
    "mt_num_allocs_1000_time": 2676.75
   }
  }
 }
}

Note that something very bad happens for the larger allocations: there
is a 25x slowdown from 25 to 400 allocations of 4KB blocks...

ChangeLog:
2017-12-01  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/Makefile: Add malloc-simple benchmark.
	* benchtests/bench-malloc-simple.c: New benchmark.
--

diff --git a/benchtests/Makefile b/benchtests/Makefile
index d8681fce8cf399bc655f3f6a7717897eb9c30619..a4b2573cfa706bd6369063a995d512e0947c7bd5 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -67,8 +67,10 @@ stdio-common-benchset := sprintf
 
 math-benchset := math-inlines
 
+malloc-benchset := malloc-simple
+
 benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
-	    $(math-benchset)
+	    $(math-benchset) $(malloc-benchset)
 
 CFLAGS-bench-ffs.c += -fno-builtin
 CFLAGS-bench-ffsll.c += -fno-builtin
@@ -86,6 +88,7 @@ $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
 $(objpfx)bench-malloc-thread: $(shared-thread-library)
+$(objpfx)bench-malloc-simple: $(shared-thread-library)
 
 \f
 
diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
new file mode 100644
index 0000000000000000000000000000000000000000..e786ddd9635f835b2f01b00a80f3cf0d2de82d48
--- /dev/null
+++ b/benchtests/bench-malloc-simple.c
@@ -0,0 +1,152 @@
+/* Benchmark malloc and free functions.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+#define NUM_ITERS 1000000
+#define NUM_ALLOCS 4
+#define NUM_SIZES  4
+#define MAX_ALLOCS 1000
+
+typedef struct
+{
+  size_t iters;
+  size_t size;
+  int n;
+  timing_t elapsed;
+} malloc_args;
+
+static void
+do_benchmark (malloc_args *args, int **arr)
+{
+  timing_t start, stop;
+  size_t iters = args->iters;
+  size_t size = args->size;
+  int n = args->n;
+
+  TIMING_NOW (start);
+
+  for (int j = 0; j < iters; j++)
+    {
+      for (int i = 0; i < n; i++)
+        arr[i] = malloc (size);
+
+      for (int i = 0; i < n; i++)
+        free (arr[i]);
+    }
+
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (args->elapsed, start, stop);
+}
+
+static malloc_args tests[2][NUM_SIZES][NUM_ALLOCS];
+static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS };
+static size_t sizes[NUM_SIZES] = { 16, 256, 1024, 4096 };
+
+static void *
+dummy (void *p)
+{
+  return p;
+}
+
+int
+main (int argc, char **argv)
+{
+  size_t iters = NUM_ITERS;
+  int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
+  unsigned long res;
+
+  TIMING_INIT (res);
+  (void) res;
+
+  for (int t = 0; t < 2; t++)
+    for (int j = 0; j < NUM_SIZES; j++)
+      for (int i = 0; i < NUM_ALLOCS; i++)
+	{
+          tests[t][j][i].n = allocs[i];
+	  tests[t][j][i].size = sizes[j];
+	  tests[t][j][i].iters = iters / allocs[i];
+
+	  /* Do a quick warmup run.  */
+	  if (t == 0)
+	    do_benchmark (&tests[0][j][i], arr);
+	}
+
+  /* Run benchmark single threaded.  */
+  for (int j = 0; j < NUM_SIZES; j++)
+    for (int i = 0; i < NUM_ALLOCS; i++)
+      do_benchmark (&tests[0][j][i], arr);
+
+  /* Create an empty thread so SINGLE_THREAD_P becomes false.  */
+  pthread_t t;
+  pthread_create(&t, NULL, dummy, NULL);
+  pthread_join(t, NULL);
+
+  /* Repeat benchmark with SINGLE_THREAD_P == false.  */
+  for (int j = 0; j < NUM_SIZES; j++)
+    for (int i = 0; i < NUM_ALLOCS; i++)
+      do_benchmark (&tests[1][j][i], arr);
+
+  free (arr);
+
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+
+  json_attr_object_begin (&json_ctx, "malloc");
+
+  for (int j = 0; j < NUM_SIZES; j++)
+    {
+      char s[100];
+      double iters2 = iters;
+      sprintf (s, "malloc_block_size_%04ld", sizes[j]);
+      json_attr_object_begin (&json_ctx, s);
+
+      for (int i = 0; i < NUM_ALLOCS; i++)
+	{
+	  sprintf (s, "st_num_allocs_%04d_time", allocs[i]);
+	  json_attr_double (&json_ctx, s, tests[0][j][i].elapsed / iters2);
+	}
+
+      for (int i = 0; i < NUM_ALLOCS; i++)
+        {
+          sprintf (s, "mt_num_allocs_%04d_time", allocs[i]);
+          json_attr_double (&json_ctx, s, tests[1][j][i].elapsed / iters2);
+        }
+
+      json_attr_object_end (&json_ctx);
+    }
+
+  json_attr_object_end (&json_ctx);
+
+  json_attr_object_end (&json_ctx);
+
+  json_document_end (&json_ctx);
+  return 0;
+}


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2017-12-01 13:51 Wilco Dijkstra
@ 2017-12-01 16:13 ` Carlos O'Donell
  2017-12-18 15:18   ` Wilco Dijkstra
  0 siblings, 1 reply; 15+ messages in thread
From: Carlos O'Donell @ 2017-12-01 16:13 UTC (permalink / raw)
  To: Wilco Dijkstra, libc-alpha@sourceware.org; +Cc: nd

On 12/01/2017 05:51 AM, Wilco Dijkstra wrote:
> Add a malloc micro benchmark to enable accurate testing of the
> various paths in malloc and free.  The benchmark does a varying
> number of allocations of a given block size, then frees them again.
> It does so for several block sizes and number of allocated blocks.
> Although the test is single-threaded, it also tests what happens
> when you disable single-threaded fast paths (ie. SINGLE_THREAD_P
> is false).
> 
> OK for commit?

High level:

This test is a long time coming and is a great idea.

My "big" question here is: What are we trying to model?

Do we want to prove that the single threaded optimizations you
added are helping a given size class of allocations?

You are currently modeling a workload that has increasing
memory size requests and in some ways this is an odd workload
that has high external fragmentation characteristics. For example
after allocating lots of 256 byte blocks we move on to 1024 byte
blocks, with the latter being unusable unless we coalesce.

We need to discuss what we want to model here, and I mention that
below in my larger comment.

I have no objection to modeling what you have here, but if we do
this then you need a big paragraph of text explaining why this
particular case is important to you at ARM, or to yourself as a
developer.

Design:

Overall this looks good.

I like that you test with just one thread and then again after
having created a thread.

I *wish* we could test main_arena vs. threaded arena, since they
have different code and behave differently e.g. sbrk vs. mmap'd
heap.

Implementation:

You need to make this robust against env vars changing malloc
behaviour. You should use mallopt to change some parameters.

> Typical output on an x64 box:
> {
>  "timing_type": "hp_timing",
>  "functions": {
>   "malloc": {
>    "malloc_block_size_0016": {
>     "st_num_allocs_0025_time": 53.5486,
>     "st_num_allocs_0100_time": 57.2553,
>     "st_num_allocs_0400_time": 57.3204,
>     "st_num_allocs_1000_time": 57.2059,
>     "mt_num_allocs_0025_time": 87.7903,
>     "mt_num_allocs_0100_time": 100.772,
>     "mt_num_allocs_0400_time": 103.827,
>     "mt_num_allocs_1000_time": 104.812
>    },
>    "malloc_block_size_0256": {
>     "st_num_allocs_0025_time": 78.3056,
>     "st_num_allocs_0100_time": 85.6392,
>     "st_num_allocs_0400_time": 91.5187,
>     "st_num_allocs_1000_time": 163.458,
>     "mt_num_allocs_0025_time": 115.925,
>     "mt_num_allocs_0100_time": 140.735,
>     "mt_num_allocs_0400_time": 152.044,
>     "mt_num_allocs_1000_time": 225.118
>    },
>    "malloc_block_size_1024": {
>     "st_num_allocs_0025_time": 113.705,
>     "st_num_allocs_0100_time": 103.79,
>     "st_num_allocs_0400_time": 479.029,
>     "st_num_allocs_1000_time": 634.228,
>     "mt_num_allocs_0025_time": 145.807,
>     "mt_num_allocs_0100_time": 151.157,
>     "mt_num_allocs_0400_time": 526.499,
>     "mt_num_allocs_1000_time": 687.357
>    },
>    "malloc_block_size_4096": {
>     "st_num_allocs_0025_time": 105.101,
>     "st_num_allocs_0100_time": 1640.23,
>     "st_num_allocs_0400_time": 2411.26,
>     "st_num_allocs_1000_time": 2641.56,
>     "mt_num_allocs_0025_time": 156.323,
>     "mt_num_allocs_0100_time": 1702.94,
>     "mt_num_allocs_0400_time": 2453,
>     "mt_num_allocs_1000_time": 2676.75
>    }
>   }
>  }
> }
> 
> Note something very bad happens for the larger allocations, there
> is a 25x slowdown from 25 to 400 allocations of 4KB blocks...

Keep in mind you are testing the performance of sbrk here. In a threaded
arena, the non-main_arena mmap's a 64MiB heap (on 64-bit) and then
draws allocations from it. So in some ways main_arena is expensive,
but both have to pay a page-touch cost...

For each 4KiB block you touch the block to write the co-located metadata
and that forces the kernel to give you a blank page, which you then do 
nothing with. Then you repeat the above again.

For all other sizes you amortize the cost of the new page among
several allocations.

Do you have any other explanation?

At some point you will hit the mmap threshold and the cost of the
allocation will skyrocket as you have to call mmap.

> ChangeLog:
> 2017-12-01  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* benchtests/Makefile: Add malloc-simple benchmark.
> 	* benchtests/bench-malloc-simple.c: New benchmark.
> --
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index d8681fce8cf399bc655f3f6a7717897eb9c30619..a4b2573cfa706bd6369063a995d512e0947c7bd5 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -67,8 +67,10 @@ stdio-common-benchset := sprintf
>  
>  math-benchset := math-inlines
>  
> +malloc-benchset := malloc-simple
> +

OK.

>  benchset := $(string-benchset-all) $(stdlib-benchset) $(stdio-common-benchset) \
> -	    $(math-benchset)
> +	    $(math-benchset) $(malloc-benchset)

OK.

>  
>  CFLAGS-bench-ffs.c += -fno-builtin
>  CFLAGS-bench-ffsll.c += -fno-builtin
> @@ -86,6 +88,7 @@ $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
>  $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
>  $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
>  $(objpfx)bench-malloc-thread: $(shared-thread-library)
> +$(objpfx)bench-malloc-simple: $(shared-thread-library)

OK.

>  
>  \f
>  
> diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e786ddd9635f835b2f01b00a80f3cf0d2de82d48
> --- /dev/null
> +++ b/benchtests/bench-malloc-simple.c
> @@ -0,0 +1,152 @@
> +/* Benchmark malloc and free functions.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <pthread.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include "bench-timing.h"
> +#include "json-lib.h"
> +
> +#define NUM_ITERS 1000000
> +#define NUM_ALLOCS 4
> +#define NUM_SIZES  4
> +#define MAX_ALLOCS 1000
> +
> +typedef struct
> +{
> +  size_t iters;
> +  size_t size;
> +  int n;
> +  timing_t elapsed;
> +} malloc_args;
> +
> +static void
> +do_benchmark (malloc_args *args, int **arr)
> +{
> +  timing_t start, stop;
> +  size_t iters = args->iters;
> +  size_t size = args->size;
> +  int n = args->n;
> +
> +  TIMING_NOW (start);
> +
> +  for (int j = 0; j < iters; j++)
> +    {
> +      for (int i = 0; i < n; i++)
> +        arr[i] = malloc (size);
> +
> +      for (int i = 0; i < n; i++)
> +        free (arr[i]);
> +    }
> +
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (args->elapsed, start, stop);
> +}
> +
> +static malloc_args tests[2][NUM_SIZES][NUM_ALLOCS];
> +static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS };
> +static size_t sizes[NUM_SIZES] = { 16, 256, 1024, 4096 };

In glibc we have:

tcache -> fastbins -> smallbins -> largebins -> unsorted -> mmap

If you proceed through from small allocations to larger allocations
you will create chunks that cannot be used by future allocations.
In many cases this is a worst case performance bottleneck. The
heap will contain many 256 byte allocations but these cannot service
the 1024 bytes, that is unless consolidation has been run. So this
tests the consolidation as much as anything else, which might not
trigger because of the free thresholds required.

So what are we trying to model here?

If we want to look at the cost of independent size class allocations
then we need a clean process and allocate only a given size, and look
at performance across the number of allocations.

In which case we need to do:

* Spawn new process with size as an argument.
* Have the new process track performance at N allocations of the
  same size.
* Record result.
* Increase size.
* Repeat.

This way each new spawned subprocess is "clean" and we exercise
a particular size class of allocations.

I would also have much finer grained allocations by powers of 2.
2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 etc. You want
to see what happens for the allocations which are:

* Less than a chunk size.
* Fit in the tcache?
* Fit in the fastbins?
* Fit in the smallbins?
* Fit in the largebins?
* Fit only in mmap? (allocation specifically larger than a
  MALLOC_MMAP_THRESHOLD_ you set).

Would this serve better to show that your single threaded malloc
changes were helpful for a given size class?

> +
> +static void *
> +dummy (void *p)
> +{
> +  return p;
> +}
> +
> +int
> +main (int argc, char **argv)
> +{
> +  size_t iters = NUM_ITERS;
> +  int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
> +  unsigned long res;
> +

You need to use mallopt to make sure the user's environment
did not set MALLOC_MMAP_THRESHOLD_ to a value lower than your
maximum allocation size.

Similarly any other mallopt parameter you think is important
needs to be set.

If you spawned a clean subprocess you could clean the env var,
and that might actually be easier from a maintenance perspective.

> +  TIMING_INIT (res);
> +  (void) res;
> +
> +  for (int t = 0; t < 2; t++)
> +    for (int j = 0; j < NUM_SIZES; j++)
> +      for (int i = 0; i < NUM_ALLOCS; i++)
> +	{
> +          tests[t][j][i].n = allocs[i];
> +	  tests[t][j][i].size = sizes[j];
> +	  tests[t][j][i].iters = iters / allocs[i];
> +
> +	  /* Do a quick warmup run.  */
> +	  if (t == 0)
> +	    do_benchmark (&tests[0][j][i], arr);
> +	}
> +
> +  /* Run benchmark single threaded.  */
> +  for (int j = 0; j < NUM_SIZES; j++)
> +    for (int i = 0; i < NUM_ALLOCS; i++)
> +      do_benchmark (&tests[0][j][i], arr);
> +
> +  /* Create an empty thread so SINGLE_THREAD_P becomes false.  */
> +  pthread_t t;
> +  pthread_create(&t, NULL, dummy, NULL);
> +  pthread_join(t, NULL);
> +
> +  /* Repeat benchmark with SINGLE_THREAD_P == false.  */
> +  for (int j = 0; j < NUM_SIZES; j++)
> +    for (int i = 0; i < NUM_ALLOCS; i++)
> +      do_benchmark (&tests[1][j][i], arr);
> +
> +  free (arr);
> +
> +  json_ctx_t json_ctx;
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +
> +  json_attr_object_begin (&json_ctx, "malloc");
> +
> +  for (int j = 0; j < NUM_SIZES; j++)
> +    {
> +      char s[100];
> +      double iters2 = iters;
> +      sprintf (s, "malloc_block_size_%04ld", sizes[j]);
> +      json_attr_object_begin (&json_ctx, s);
> +
> +      for (int i = 0; i < NUM_ALLOCS; i++)
> +	{
> +	  sprintf (s, "st_num_allocs_%04d_time", allocs[i]);
> +	  json_attr_double (&json_ctx, s, tests[0][j][i].elapsed / iters2);
> +	}
> +
> +      for (int i = 0; i < NUM_ALLOCS; i++)
> +        {
> +          sprintf (s, "mt_num_allocs_%04d_time", allocs[i]);
> +          json_attr_double (&json_ctx, s, tests[1][j][i].elapsed / iters2);
> +        }
> +
> +      json_attr_object_end (&json_ctx);
> +    }
> +
> +  json_attr_object_end (&json_ctx);
> +
> +  json_attr_object_end (&json_ctx);
> +
> +  json_document_end (&json_ctx);
> +  return 0;
> +}
> 


-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2017-12-01 16:13 ` Carlos O'Donell
@ 2017-12-18 15:18   ` Wilco Dijkstra
  2017-12-18 16:32     ` Carlos O'Donell
  2017-12-18 23:02     ` DJ Delorie
  0 siblings, 2 replies; 15+ messages in thread
From: Wilco Dijkstra @ 2017-12-18 15:18 UTC (permalink / raw)
  To: Carlos O'Donell, libc-alpha@sourceware.org; +Cc: nd

Carlos O'Donell wrote:

Thanks for the review!

> This test is a long time coming and is a great idea.
>
> My "big" question here is: What are we trying to model?
>
> Do we want to prove that the single threaded optimizations you
> added are helping a given size class of allocations?

Yes that is the main goal of the benchmark. It models the allocation
pattern of a few benchmarks which were reported as being slow
despite the new tcache (which didn't show any gains).

When the tcache was configured to be larger there was a major
speedup, suggesting that the tcache doesn't work on patterns with
a high number of (de)allocations of similar sized blocks. Since DJ
didn't seem keen on increasing the tcache size despite it showing
major gains across a wide range of benchmarks, I decided to fix
the performance for the single-threaded case at least. It's now 2.5x
faster on a few server benchmarks (of course the next question is
whether tcache is actually useful in its current form).

> You are currently modeling a workload that has increasing
> memory size requests and in some ways this is an odd workload
> that has high external fragmentation characteristics. For example
> after allocating lots of 256 byte blocks we move on to 1024 byte
> blocks, with the latter being unusable unless we coalesce.

I'm assuming coalescing works as expected. If it doesn't, it would
be a nasty bug.

> I *wish* we could test main_arena vs. threaded arena, since they
> have different code and behave differently e.g. sbrk vs. mmap'd
> heap.

I'd have to check how easy it is to force it to use the thread arena.
The whole thing is just crazily weird, with too many different code
paths and possibilities. It seems much easier just to always use
thread arenas, and perhaps use sbrk only if there is some serious
advantage over mmap. Also it appears all the values are set to
what was perhaps reasonable 10-20 years ago, not today. When
a small server has 128GB, there is absolutely no reason to worry
about returning 128KB to the OS as quickly as possible...

> Implementation:
>
> You need to make this robust against env vars changing malloc
> behaviour. You should use mallopt to change some parameters.

You mean setting the tcache size explicitly (maybe even switching off)?

>> Note something very bad happens for the larger allocations, there
>> is a 25x slowdown from 25 to 400 allocations of 4KB blocks...
>
> Keep in mind you are testing the performance of sbrk here. In a threaded
> arena, the non-main_arena mmap's a 64MiB heap (on 64-bit) and then
> draws allocations from it. So in some ways main_arena is expenseive,
> but both have to pay a page-touch cost...
>
> For each 4KiB block you touch the block to write the co-located metadata
> and that forces the kernel to give you a blank page, which you then do 
> nothing with. Then you repeat the above again.
>
> For all other sizes you amortize the cost of the new page among
> several allocations.
> 
> Do you have any other explanation?

Well that looks like a reasonable explanation, but it shows a serious
performance bug - I think we use MADV_DONTNEED which doesn't
work on Linux and will cause all pages to be deallocated, reallocated
and zero-filled... This is the sort of case where you need to be very
careful to amortize over many allocations or long elapsed time, if at
all (many other allocators never give pages back).

> At some point you will hit the mmap threshold and the cost of the
> allocation will skyrocket as you have to call mmap.

That only happens on huge allocations (much larger than 4KB), or when
you run out of sbrk space (unlikely).

> In glibc we have:
>
> tcache -> fastbins -> smallbins -> largbing -> unordered -> mmap
>
> If you proceed through from small allocations to larger allocations
> you will create chunks that cannot be used by future allocations.
> In many cases this is a worst case performance bottleneck. The
> heap will contain many 256 byte allocations but these cannot service
> the 1024 bytes, that is unless consolidation has been run. So this
> tests the consolidation as much as anything else, which might not
> trigger because of the free thresholds required.

If consolidation doesn't work that's a serious bug. However allocation
performance should not be affected either way - in a real application
those small blocks might still be allocated. As long as consolidation
runs quickly (generally it's a small percentage in profiles), it won't
affect the results.

> So what are we trying to model here?
>
> If we want to look at the cost of independent size class allocations
> then we need a clean process and allocate only a given size, and look
> at performance across the number of allocations.

That's certainly feasible if we keep the number of sizes small (less
than the list below). It should be easy to reuse the bench-malloc-thread.c
makefile magic to run the same binary with multiple sizes.

> I would also have much finer grained allocations by powers of 2.
> 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4092 etc. You want
> to see what happens for the allocations which are:
..
> Would this serve better to show that your single threaded malloc
> changes were helpful for a given size class?

Well I can easily add some of the above sizes, it's highly configurable.
I don't think there will be much difference with the existing sizes though.

> You need to use mallopt to make sure the user's environment
> did not set MALLOC_MMAP_THRESHOLD_ to a value lower than your
> maximum allocation size.

I don't think that is possible given the largest allocation size is 4KB.

Wilco

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2017-12-18 15:18   ` Wilco Dijkstra
@ 2017-12-18 16:32     ` Carlos O'Donell
  2017-12-18 23:02     ` DJ Delorie
  1 sibling, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2017-12-18 16:32 UTC (permalink / raw)
  To: Wilco Dijkstra, libc-alpha@sourceware.org; +Cc: nd

On 12/18/2017 07:18 AM, Wilco Dijkstra wrote:
> Carlos O'Donell wrote:
> 
> Thanks for the review!

Thank you for the detailed follow up.

>> This test is a long time coming and is a great idea.
>>
>> My "big" question here is: What are we trying to model?
>>
>> Do we want to prove that the single threaded optimizations you
>> added are helping a given size class of allocations?
> 
> Yes that is the main goal of the benchmark. It models the allocation
> pattern of a few benchmarks which were reported as being slow
> despite the new tcache (which didn't show any gains).

OK.

> When the tcache was configured to be larger there was a major
> speedup, suggesting that the tcache doesn't work on patterns with
> a high number of (de)allocations of similar sized blocks. Since DJ
> didn't seem keen on increasing the tcache size despite it showing
> major gains across a wide range of benchmarks, I decided to fix
> the performance for the single-threaded case at least. It's now 2.5x
> faster on a few sever benchmarks (of course the next question is
> whether tcache is actually useful in its current form).

If you have a pattern of malloc/free of *similar* sized blocks, then
it overflows the sized bin in the tcache, with other size bins remaining
empty. The cache itself does not dynamically reconfigure itself to consume
X MiB or Y % of RSS, instead it uses a simple data structure to contain
a fixed number of fixed size blocks.

Therefore I agree, that enhancing the core data structure in tcache may
result in better overall performance, particularly if we got rid of the
fixed bin sizes and instead found a way to be performant *and* keep a
running total of consumption.

This is not a trivial goal though.

Likewise *all* of malloc needs to be moved to a better data structure than
just linked lists. I would like to see glibc's malloc offer a caching
footprint of no more than Y % of RSS available, and let the user tweak that.
Currently we just consume RSS without much regard for overhead. Though this
is a different case than what you are talking about, the changes are
related via data-structure enhancements that would benefit both cases IMO.

>> You are currently modeling a workload that has increasing
>> memory size requests and in some ways this is an odd workload
>> that has high external fragmentation characteristics. For example
>> after allocating lots of 256 byte blocks we move on to 1024 byte
>> blocks, with the latter being unusable unless we coalesce.
> 
> I'm assuming coalescing works as expected. If it doesn't, it would
> be a nasty bug.

You are probably right.

>> I *wish* we could test main_arena vs. threaded arena, since they
>> have different code and behave differently e.g. sbrk vs. mmap'd
>> heap.
> 
> I'd have to check how easy it is to force it to use the thread arena.
> The whole thing is just crazily weird, with too many different code
> paths and possibilities. It seems much easier just to always use
> thread arenas, and perhaps use sbrk only if there is some serious
> advantage over mmap. Also it appears all the values are set to
> what was perhaps reasonable 10-20 years ago, not today. When
> a small server has 128GB, there is absolutely no reason to worry
> about returning 128KB to the OS as quickly as possible...

(a) Returning memory based on a limit of memory cached.

The decision to return memory to the operating system should be based
on a desire to run within the bounds of a certain amount of cached
memory in the user process.

This should be the goal IMO. We should not return 128KB to the OS unless
we are within our bounds of Y % of RSS cache, or X MiB of RSS cache.
This bounded behaviour is more and more important for (b).

So I argue that this has nothing to do with how much memory the server
has but how much the user wants as cache in the process. This gets back
to your point about tcache size needing to be bigger; if you had Y % RSS
allocated to tcache it would solve your needs.

(b) Packing density matters, or rather consistent RSS usage matters.

Yes, and no. We are facing a lot of downstream requests for container
and VM packing efficiency. This means that your 128GB is split into
32 servers each with 4GB, or 64 servers each with 2GB running smaller
services. In these cases we *do* care a lot about packing density.

(c) Maintenance costs of the existing weird cases and harmonizing threaded
    and main_arena paths.

As I suggested in bug 15321:
https://sourceware.org/bugzilla/show_bug.cgi?id=15321

We need to merge the main_arena and threaded code together, and stop
treating them as different things. Right now the main_arena, if you
look at the code, is a *pretend* heap with a partial data structure
layered in place. This needs to go away. We need to treat all heaps
as identical, with identical code paths, with just different backing
storage.

I think people still expect that thread 0 allocates from the sbrk
heap in a single-threaded application, and we can do that by ensuring
sbrk is used to provide the backing store for the main thread. This way
we can jump the pointer 64MB like we normally do for mmap'd heaps, but
then on page touch there the kernel just extends the heap normally.
No difference (except VMA usage).

Once that is in place we can experiment with other strategies like never
using sbrk.

>> Implementation:
>>
>> You need to make this robust against env vars changing malloc
>> behaviour. You should use mallopt to change some parameters.
> 
> You mean setting the tcache size explicitly (maybe even switching off)?

You have several options:

* Add a wrapper script that clears all mallopt related env vars.
* Adjust the Makefile to clear all mallopt related env vars before starting
  the test.
* Set tcache sizes explicitly *if* that is what you want, but likely you
  don't want this and want to run the test with just the defaults to see
  how the defaults are performing.

>>> Note something very bad happens for the larger allocations, there
>>> is a 25x slowdown from 25 to 400 allocations of 4KB blocks...
>>
>> Keep in mind you are testing the performance of sbrk here. In a threaded
>> arena, the non-main_arena mmap's a 64MiB heap (on 64-bit) and then
>> draws allocations from it. So in some ways main_arena is expensive,
>> but both have to pay a page-touch cost...
>>
>> For each 4KiB block you touch the block to write the co-located metadata
>> and that forces the kernel to give you a blank page, which you then do 
>> nothing with. Then you repeat the above again.
>>
>> For all other sizes you amortize the cost of the new page among
>> several allocations.
>>
>> Do you have any other explanation?
> 
> Well that looks like a reasonable explanation, but it shows a serious
> performance bug - I think we use MADV_DONTNEED which doesn't
> work on Linux and will cause all pages to be deallocated, reallocated
> and zero-filled... This is the sort of case where you need to be very
> careful to amortize over many allocations or long elapsed time, if at
> all (many other allocators never give pages back).

We need to move to MADV_FREE, which was designed for memory allocators.

The semantics of MADV_DONTNEED have the problem that one has to consider:
* Is the data destructively lost in that page?
* Is the data flushed to the underlying store before being not-needed?
All of which lead to MADV_DONTNEED doing a lot of teardown work to ensure
that users don't corrupt the data in their backing stores.

I think that detection of MADV_FREE, and usage, would help performance,
but only on Linux 4.5 and later, and that might be OK for you.

>> At some point you will hit the mmap threshold and the cost of the
>> allocation will skyrocket as you have to call mmap.
> 
> That only happens on huge allocations (much larger than 4KB), or when
> you run out of sbrk space (unlikely).

It happens at the mmap threshold, which is variable :-)

Please consider the implementation as a fluid set of parameters that
model application behaviour.

We can run out of sbrk space *immediately* if you have an interposing
low-address mmap that means sbrk can't grow (again see swbz#15321).

Right now the mmap threshold is 128KiB though, so you're right, for
the default. I don't know if that size is a good idea or not.

>> In glibc we have:
>>
>> tcache -> fastbins -> smallbins -> largebins -> unordered -> mmap
>>
>> If you proceed through from small allocations to larger allocations
>> you will create chunks that cannot be used by future allocations.
>> In many cases this is a worst case performance bottleneck. The
>> heap will contain many 256 byte allocations but these cannot service
>> the 1024 bytes, that is unless consolidation has been run. So this
>> tests the consolidation as much as anything else, which might not
>> trigger because of the free thresholds required.
> 
> If consolidation doesn't work that's a serious bug. However allocation
> performance should not be affected either way - in a real application
> those small blocks might still be allocated. As long as consolidation
> runs quickly (generally it's a small percentage in profiles), it won't
> affect the results.

OK.

>> So what are we trying to model here?
>>
>> If we want to look at the cost of independent size class allocations
>> then we need a clean process and allocate only a given size, and look
>> at performance across the number of allocations.
> 
> That's certainly feasible if we keep the number of sizes small (less
> than the list below). It should be easy to reuse the bench-malloc-thread.c
> makefile magic to run the same binary with multiple sizes.

OK.

>> I would also have much finer grained allocations by powers of 2.
>> 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 etc. You want
>> to see what happens for the allocations which are:
> ..
>> Would this serve better to show that your single threaded malloc
>> changes were helpful for a given size class?
> 
> Well I can easily add some of the above sizes, it's highly configurable.
> I don't think there will be much difference with the existing sizes though.

Perhaps, but I don't know the answer to that.

>> You need to use mallopt to make sure the user's environment
>> did not set MALLOC_MMAP_THRESHOLD_ to a value lower than your
>> maximum allocation size.
> 
> I don't think that is possible given the largest allocation size is 4KB.

We carry out the allocation with mmap regardless, rounding up the size to
that of a page.

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2017-12-18 15:18   ` Wilco Dijkstra
  2017-12-18 16:32     ` Carlos O'Donell
@ 2017-12-18 23:02     ` DJ Delorie
  2017-12-28 14:09       ` Wilco Dijkstra
  1 sibling, 1 reply; 15+ messages in thread
From: DJ Delorie @ 2017-12-18 23:02 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: carlos, libc-alpha, nd

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Since DJ didn't seem keen on increasing the tcache size despite it
> showing major gains across a wide range of benchmarks,

It's not that I'm not keen on increasing the size, it's that there are
drawbacks to doing so and I don't want to base such a change on a guess
(even a good guess).  If you have benchmarks, let's collect them and add
them to the trace corpus.  I can send you my corpus.  (We don't have a
good solution for centrally storing such a corpus, yet) Let's run all
the tests against all the options and make an informed decision, that's
all.  If it shows gains for synthetic benchmarks, but makes qemu slower,
we need to know that.

Also, as Carlos noted, there are some downstream uses where a larger
cache may be detrimental.  Sometimes there are no universally "better"
defaults, and we provide tunables for those cases.

And, as always, I can be out-voted if the consensus disagrees with me ;-)

> I decided to fix the performance for the single-threaded case at
> least. It's now 2.5x faster on a few server benchmarks (of course the
> next question is whether tcache is actually useful in its current
> form).

Again, tcache is intended to help the multi-threaded case.  Your patches
help the single-threaded case.  If you recall, I ran your patch against
my corpus of multi-threaded tests, and saw no regressions, which is
good.

So our paranoia here is twofold...

1. Make sure that when someone says "some benchmarks" we have those
   benchmarks available to us, either as a microbenchmark in glibc or as
   a trace we can simulate and benchmark.  No more random benchmarks! :-)

2. When we say a patch "is faster", let's run all our benchmarks and
   make sure that we don't mean "on some benchmarks."  The whole point
   of the trace/sim stuff is to make sure key downstream users aren't
   left out of the optimization work, and end up with worse performance.

We probably should add "on all major architectures" too but that assumes
we have machines on which we can run the benchmarks.

So we should be able to answer your question, not just wonder...

> I'd have to check how easy it is to force it to use the thread arena.

I'm guessing we could have a glibc-internal API to tag the heap as
"corrupt" which would preclude using it.

> If consolidation doesn't work that's a serious bug.

Sometimes it's not so much a case of "doesn't work" as a case of "not
attempted for performance reasons".  If we can show that a different
design choice is universally better[*], we should change it.

[*] or at least, universally-enough for a "system" allocator like glibc
    must provide.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2017-12-18 23:02     ` DJ Delorie
@ 2017-12-28 14:09       ` Wilco Dijkstra
  2017-12-28 19:01         ` DJ Delorie
  0 siblings, 1 reply; 15+ messages in thread
From: Wilco Dijkstra @ 2017-12-28 14:09 UTC (permalink / raw)
  To: DJ Delorie; +Cc: carlos@redhat.com, libc-alpha@sourceware.org, nd

DJ Delorie wrote:
> Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> > Since DJ didn't seem keen on increasing the tcache size despite it
> > showing major gains across a wide range of benchmarks,
>
> It's not that I'm not keen on increasing the size, it's that there are
> drawbacks to doing so and I don't want to base such a change on a guess
> (even a good guess).  If you have benchmarks, let's collect them and add
> them to the trace corpus.  I can send you my corpus.  (We don't have a
> good solution for centrally storing such a corpus, yet) Let's run all
> the tests against all the options and make an informed decision, that's
> all.  If it shows gains for synthetic benchmarks, but makes qemu slower,
> we need to know that.

Yes I'd be interested in the traces. I presume they are ISA independent and
can just be replayed?

> Also, as Carlos noted, there are some downstream uses where a larger
> cache may be detrimental.  Sometimes there are no universally "better"
> defaults, and we provide tunables for those cases.

It depends. I've seen cases where returning pages to the OS too quickly
causes a huge performance loss. I think in many of these cases we can
be far smarter and use adaptive algorithms. If say 50% of your memory
ends up in the tcache and you can't allocate a new block, it seems a good
idea to consolidate first. If it's less than 1%, why worry about it?

So short term there may be simple ways to tune tcache, eg. allow a larger
number of small blocks (trivial change), or limit total bytes in the tcache
(which could be dynamically increased as more memory is allocated).

Longer term we need to make arenas per-thread - see below.

> Again, tcache is intended to help the multi-threaded case.  Your patches
> help the single-threaded case.  If you recall, I ran your patch against
> my corpus of multi-threaded tests, and saw no regressions, which is
> good.

Arenas are already mostly per-thread. My observation was that the gains
from tcache are due to bypassing completely uncontended locks.
If an arena could be marked as owned by a thread, the fast single-threaded
paths could be used all of the time (you'd have to handle frees from other
threads of course but those could go in a separate bin for consolidation).

> So our paranoia here is twofold...
>
> 1. Make sure that when someone says "some benchmarks" we have those
>    benchmarks available to us, either as a microbenchmark in glibc or as
>    a trace we can simulate and benchmark.  No more random benchmarks! :-)

Agreed, it's quite feasible to create more traces and more microbenchmarks.

> 2. When we say a patch "is faster", let's run all our benchmarks and
>    make sure that we don't mean "on some benchmarks."  The whole point
>    of the trace/sim stuff is to make sure key downstream users aren't
>    left out of the optimization work, and end up with worse performance.

Well you can't expect gains on all benchmarks or have a "never regress
anything ever" rule. Minor changes in alignment of a heap block or allocation
of pages from the OS can have a large performance impact that's hard to
control. The smallest possible RSS isn't always better. The goal should be to
improve average performance across a wide range of applications.

> We probably should add "on all major architectures" too but that assumes
> we have machines on which we can run the benchmarks.

Szabolcs or I would be happy to run the traces on AArch64.

Wilco

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2017-12-28 14:09       ` Wilco Dijkstra
@ 2017-12-28 19:01         ` DJ Delorie
  0 siblings, 0 replies; 15+ messages in thread
From: DJ Delorie @ 2017-12-28 19:01 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: carlos, libc-alpha, nd

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Yes I'd be interested in the traces. I presume they are ISA independent and
> can just be replayed?

Yup, they're ISA-agnostic compressed pseudo-codes that run a state
machine.  I put them here (except the two biggest, which take days to
transfer):

http://www.delorie.com/malloc/

Be kind, that's my house's bandwidth ;-)

The simulator is in the dj/malloc branch, in malloc/trace_run.c

> I think in many of these cases we can be far smarter and use adaptive
> algorithms.

I'm wary of "smart" algorithms because it's so hard to generalize them
to work sufficiently well in all cases, without accidentally causing
super-stupid behavior in some unusual app.

But yeah :-)

> Longer term we need to make arenas per-thread - see below.

They're sort of per-thread now, but a lot of apps pass malloc'd memory
between threads.  A strict one-per-thread isn't optimal either.

> My observation was that the gains from tcache are due to bypassing
> completely uncontended locks.

Yup.

> If an arena could be marked as owned by
> a thread,

That mark would need to be an uncontended lock...

>> 2. When we say a patch "is faster", let's run all our benchmarks and
>>    make sure that we don't mean "on some benchmarks."  The whole point
>>    of the trace/sim stuff is to make sure key downstream users aren't
>>    left out of the optimization work, and end up with worse performance.
>
> Well you can't expect gains on all benchmarks or have a "never regress
> anything ever" rule.

Sure, I just don't want surprises.  Also, benchmarks from the Real World
can be prioritized - a patch which improves a synthetic benchmark but
makes chrome worse should be rejected.  A generally good patch that
makes a few apps worse should at least be investigated to find out why.
Etc.

> Szabolcs or I would be happy to run the traces on AArch64.

That would be helpful :-)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] Add malloc micro benchmark
@ 2019-02-01 16:27 Wilco Dijkstra
  2019-02-08 19:37 ` DJ Delorie
  2019-02-28  4:52 ` Carlos O'Donell
  0 siblings, 2 replies; 15+ messages in thread
From: Wilco Dijkstra @ 2019-02-01 16:27 UTC (permalink / raw)
  To: 'GNU C Library'; +Cc: nd

Add a malloc micro benchmark to enable accurate testing of the
various paths in malloc and free.  The benchmark does a varying
number of allocations of a given block size, then frees them again.

It tests 3 different scenarios: single-threaded using main arena,
multi-threaded using thread-arena, main arena with SINGLE_THREAD_P
false.

OK for commit?

ChangeLog:
2019-02-01  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/Makefile: Add malloc-simple benchmark.
	* benchtests/bench-malloc-simple.c: New benchmark.

--
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 12036b1935dc7ea84b421f024d6fe3190ae35a6e..09f7cb8e475a312268eebb4d346edde70d22bb3d 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -90,7 +90,7 @@ CFLAGS-bench-trunc.c += -fno-builtin
 CFLAGS-bench-truncf.c += -fno-builtin
 
 ifeq (${BENCHSET},)
-bench-malloc := malloc-thread
+bench-malloc := malloc-thread malloc-simple
 else
 bench-malloc := $(filter malloc-%,${BENCHSET})
 endif
@@ -98,7 +98,7 @@ endif
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
-$(objpfx)bench-malloc-thread: $(shared-thread-library)
+$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library)
 
 \f
 
@@ -165,7 +165,7 @@ bench-clean:
 ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
    wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread
+   malloc-thread malloc-simple
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
@@ -194,10 +194,18 @@ bench-set: $(binaries-benchset)
 
 bench-malloc: $(binaries-bench-malloc)
 	for run in $^; do \
+	  echo "$${run}"; \
+	  if [ `basename $${run}` = "bench-malloc-thread" ]; then \
 		for thr in 1 8 16 32; do \
 			echo "Running $${run} $${thr}"; \
-	  $(run-bench) $${thr} > $${run}-$${thr}.out; \
-	  done;\
+			$(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  else \
+		for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
+		  echo "Running $${run} $${thr}"; \
+		  $(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  fi;\
 	done
 
 # Build and execute the benchmark functions.  This target generates JSON
diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
new file mode 100644
index 0000000000000000000000000000000000000000..995d78965fd65fdf1c84cf85bf38990cd49402b3
--- /dev/null
+++ b/benchtests/bench-malloc-simple.c
@@ -0,0 +1,182 @@
+/* Benchmark malloc and free functions.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/resource.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+#define NUM_ITERS 1000000
+#define NUM_ALLOCS 4
+#define MAX_ALLOCS 1600
+
+typedef struct
+{
+  size_t iters;
+  size_t size;
+  int n;
+  timing_t elapsed;
+} malloc_args;
+
+static void
+do_benchmark (malloc_args *args, int **arr)
+{
+  timing_t start, stop;
+  size_t iters = args->iters;
+  size_t size = args->size;
+  int n = args->n;
+
+  TIMING_NOW (start);
+
+  for (int j = 0; j < iters; j++)
+    {
+      for (int i = 0; i < n; i++)
+	arr[i] = malloc (size);
+
+      for (int i = 0; i < n; i++)
+	free (arr[i]);
+    }
+
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (args->elapsed, start, stop);
+}
+
+static malloc_args tests[3][NUM_ALLOCS];
+static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS };
+
+static void *
+thread_test (void *p)
+{
+  int **arr = (int**)p;
+
+  /* Run benchmark multi-threaded.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[2][i], arr);
+
+  return p;
+}
+
+void
+bench (unsigned long size)
+{
+  size_t iters = NUM_ITERS;
+  int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
+  unsigned long res;
+
+  TIMING_INIT (res);
+
+  for (int t = 0; t <= 3; t++)
+    for (int i = 0; i < NUM_ALLOCS; i++)
+      {
+	tests[t][i].n = allocs[i];
+	tests[t][i].size = size;
+	tests[t][i].iters = iters / allocs[i];
+
+	/* Do a quick warmup run.  */
+	if (t == 0)
+	  do_benchmark (&tests[0][i], arr);
+      }
+
+  /* Run benchmark single threaded in main_arena.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[0][i], arr);
+
+  /* Run benchmark in a thread_arena.  */
+  pthread_t t;
+  pthread_create (&t, NULL, thread_test, (void*)arr);
+  pthread_join (t, NULL);
+
+  /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[1][i], arr);
+
+  free (arr);
+
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+
+  json_attr_object_begin (&json_ctx, "malloc");
+
+  char s[100];
+  double iters2 = iters;
+
+  json_attr_object_begin (&json_ctx, "");
+  json_attr_double (&json_ctx, "malloc_block_size", size);
+
+  struct rusage usage;
+  getrusage (RUSAGE_SELF, &usage);
+  json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss);
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_st_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[0][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_mt_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[1][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "thread_arena__allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[2][i].elapsed / iters2);
+    }
+
+  json_attr_object_end (&json_ctx);
+
+  json_attr_object_end (&json_ctx);
+
+  json_attr_object_end (&json_ctx);
+
+  json_document_end (&json_ctx);
+}
+
+static void usage (const char *name)
+{
+  fprintf (stderr, "%s: <alloc_size>\n", name);
+  exit (1);
+}
+
+int
+main (int argc, char **argv)
+{
+  long val = 16;
+  if (argc == 2)
+    val = strtol (argv[1], NULL, 0);
+
+  if (argc > 2 || val <= 0)
+    usage (argv[0]);
+
+  bench (val);
+
+  return 0;
+}

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2019-02-01 16:27 [PATCH] Add malloc micro benchmark Wilco Dijkstra
@ 2019-02-08 19:37 ` DJ Delorie
  2019-02-14 16:38   ` Wilco Dijkstra
  2019-02-28  4:52 ` Carlos O'Donell
  1 sibling, 1 reply; 15+ messages in thread
From: DJ Delorie @ 2019-02-08 19:37 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: libc-alpha, nd


Looks good to me, although I'd like some additional comments in the test
code.

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> -bench-malloc := malloc-thread
> +bench-malloc := malloc-thread malloc-simple

Adding a test, ok

> -$(objpfx)bench-malloc-thread: $(shared-thread-library)
> +$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library)

Accepting a list of tests, ok

> -   malloc-thread
> +   malloc-thread malloc-simple

Adding a test, ok

>  bench-malloc: $(binaries-bench-malloc)
>  	for run in $^; do \
> +	  echo "$${run}"; \
> +	  if [ `basename $${run}` = "bench-malloc-thread" ]; then \
>  		for thr in 1 8 16 32; do \
>  			echo "Running $${run} $${thr}"; \
> -	  $(run-bench) $${thr} > $${run}-$${thr}.out; \
> -	  done;\
> +			$(run-bench) $${thr} > $${run}-$${thr}.out; \
> +		done;\
> +	  else \
> +		for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
> +		  echo "Running $${run} $${thr}"; \
> +		  $(run-bench) $${thr} > $${run}-$${thr}.out; \
> +		done;\
> +	  fi;\
>  	done

I wonder if this could be done more elegantly, but I'm OK with a simple
approach for now.  If we end up adding many more such tests we might
need to revisit this part.

> +/* Benchmark malloc and free functions.
> +   Copyright (C) 2018 Free Software Foundation, Inc.

2019

> +
> +#include <pthread.h>

I would like to see a comment block somewhere in this code that
describes, to the casual future reader, what this test is looking for
and why it's different than other tests.  I won't hold up my OK for it,
though.

> +#define NUM_ITERS 1000000
> +#define NUM_ALLOCS 4
> +#define MAX_ALLOCS 1600

How long does this test take to run, on average, compared to other
tests?  Do we have to worry about increasing timeouts for slow hosts?

> +static void
> +do_benchmark (malloc_args *args, int **arr)
> +{
> +  timing_t start, stop;
> +  size_t iters = args->iters;
> +  size_t size = args->size;
> +  int n = args->n;
> +
> +  TIMING_NOW (start);
> +
> +  for (int j = 0; j < iters; j++)
> +    {
> +      for (int i = 0; i < n; i++)
> +	arr[i] = malloc (size);
> +
> +      for (int i = 0; i < n; i++)
> +	free (arr[i]);
> +    }
> +
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (args->elapsed, start, stop);
> +}

Simple loop, but doesn't test for malloc returning NULL.

> +  /* Run benchmark single threaded in main_arena.  */
> +  for (int i = 0; i < NUM_ALLOCS; i++)
> +    do_benchmark (&tests[0][i], arr);
> +
> +  /* Run benchmark in a thread_arena.  */
> +  pthread_t t;
> +  pthread_create (&t, NULL, thread_test, (void*)arr);
> +  pthread_join (t, NULL);
> +
> +  /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false.  */
> +  for (int i = 0; i < NUM_ALLOCS; i++)
> +    do_benchmark (&tests[1][i], arr);

So we repeat the "main thread" case but now the heap is "messy" from the
now-joined thread... ok.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2019-02-08 19:37 ` DJ Delorie
@ 2019-02-14 16:38   ` Wilco Dijkstra
  2019-02-14 20:42     ` DJ Delorie
  0 siblings, 1 reply; 15+ messages in thread
From: Wilco Dijkstra @ 2019-02-14 16:38 UTC (permalink / raw)
  To: DJ Delorie; +Cc: libc-alpha@sourceware.org, nd

Hi DJ,

> Looks good to me, although I'd like some additional comments in the test
> code.

Thanks for the review - I've added some extra comments:

+/* Benchmark the malloc/free performance of a varying number of blocks of a
+   given size.  This enables performance tracking of the t-cache and fastbins.
+   It tests 3 different scenarios: single-threaded using main arena,
+   multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
+   false.  */

> +       else \
> +             for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
> +               echo "Running $${run} $${thr}"; \
> +               $(run-bench) $${thr} > $${run}-$${thr}.out; \
> +             done;\
> +       fi;\
>        done

> I wonder if this could be done more elegantly, but I'm OK with a simple
> approach for now.  If we end up adding many more such tests we might
> need to revisit this part.

The main concern was to get a clean state so that the test of a previous block
size doesn't affect subsequent results.

> +#define NUM_ITERS 1000000
> +#define NUM_ALLOCS 4
> +#define MAX_ALLOCS 1600

> How long does this test take to run, on average, compared to other
> tests?  Do we have to worry about increasing timeouts for slow hosts?

All the test runs together finish in a fraction of the time taken by a single
test of bench-malloc-thread, so if anything we need to reduce the time of
that one by an order of magnitude (it takes ~5 minutes!).

> +static void
> +do_benchmark (malloc_args *args, int **arr)
> +{
> +  timing_t start, stop;
> +  size_t iters = args->iters;
> +  size_t size = args->size;
> +  int n = args->n;
> +
> +  TIMING_NOW (start);
> +
> +  for (int j = 0; j < iters; j++)
> +    {
> +      for (int i = 0; i < n; i++)
> +     arr[i] = malloc (size);
> +
> +      for (int i = 0; i < n; i++)
> +     free (arr[i]);
> +    }
> +
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (args->elapsed, start, stop);
> +}

> Simple loop, but doesn't test for malloc returning NULL.

Yeah, the benchmark doesn't need to care since the amount we allocate
is tiny (6.4MBytes).

Cheers,
Wilco

I've committed this:

Add a malloc micro benchmark to enable accurate testing of the
various paths in malloc and free.  The benchmark does a varying
number of allocations of a given block size, then frees them again.

It tests 3 different scenarios: single-threaded using main arena,
multi-threaded using thread-arena, main arena with SINGLE_THREAD_P
false.

OK for commit?

ChangeLog:
2019-02-14  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/Makefile: Add malloc-simple benchmark.
	* benchtests/bench-malloc-simple.c: New benchmark.

--
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 12036b1935dc7ea84b421f024d6fe3190ae35a6e..09f7cb8e475a312268eebb4d346edde70d22bb3d 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -90,7 +90,7 @@ CFLAGS-bench-trunc.c += -fno-builtin
 CFLAGS-bench-truncf.c += -fno-builtin
 
 ifeq (${BENCHSET},)
-bench-malloc := malloc-thread
+bench-malloc := malloc-thread malloc-simple
 else
 bench-malloc := $(filter malloc-%,${BENCHSET})
 endif
@@ -98,7 +98,7 @@ endif
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
-$(objpfx)bench-malloc-thread: $(shared-thread-library)
+$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library)
 
 \f
 
@@ -165,7 +165,7 @@ bench-clean:
 ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
    wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread
+   malloc-thread malloc-simple
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
@@ -194,10 +194,18 @@ bench-set: $(binaries-benchset)
 
 bench-malloc: $(binaries-bench-malloc)
 	for run in $^; do \
+	  echo "$${run}"; \
+	  if [ `basename $${run}` = "bench-malloc-thread" ]; then \
 		for thr in 1 8 16 32; do \
 			echo "Running $${run} $${thr}"; \
-	  $(run-bench) $${thr} > $${run}-$${thr}.out; \
-	  done;\
+			$(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  else \
+		for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
+		  echo "Running $${run} $${thr}"; \
+		  $(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  fi;\
 	done
 
 # Build and execute the benchmark functions.  This target generates JSON
diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
new file mode 100644
index 0000000000000000000000000000000000000000..83203ff3187654a1710c9ef81016f854957b9d64
--- /dev/null
+++ b/benchtests/bench-malloc-simple.c
@@ -0,0 +1,188 @@
+/* Benchmark malloc and free functions.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/resource.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+/* Benchmark the malloc/free performance of a varying number of blocks of a
+   given size.  This enables performance tracking of the t-cache and fastbins.
+   It tests 3 different scenarios: single-threaded using main arena,
+   multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
+   false.  */
+
+#define NUM_ITERS 200000
+#define NUM_ALLOCS 4
+#define MAX_ALLOCS 1600
+
+typedef struct  /* One benchmark configuration plus its measured result.  */
+{
+  size_t iters;  /* Number of allocate-all/free-all rounds to time.  */
+  size_t size;  /* Byte count passed to each malloc call.  */
+  int n;  /* Blocks allocated, then freed, in every round.  */
+  timing_t elapsed;  /* Written by do_benchmark: time for all rounds.  */
+} malloc_args;
+
+/* Time ARGS->iters rounds of ARGS->n mallocs of ARGS->size bytes, each round
+   freeing every block again; the result is stored in ARGS->elapsed.  */
+static void
+do_benchmark (malloc_args *args, int **arr)
+{
+  const size_t rounds = args->iters;
+  const size_t block_size = args->size;
+  const int count = args->n;
+  timing_t t0, t1;
+
+  TIMING_NOW (t0);
+  for (int rep = 0; rep < rounds; rep++)
+    {
+      /* Allocate the whole batch, then free it in allocation order.  */
+      for (int k = 0; k < count; k++)
+	arr[k] = malloc (block_size);
+      for (int k = 0; k < count; k++)
+	free (arr[k]);
+    }
+  TIMING_NOW (t1);
+
+  TIMING_DIFF (args->elapsed, t0, t1);
+}
+
+static malloc_args tests[3][NUM_ALLOCS];  /* [scenario][index into allocs].  */
+static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS };
+
+/* Thread start routine: run every tests[2] configuration, using P as the
+   pointer array handed to do_benchmark.  P is returned unchanged.  */
+static void *
+thread_test (void *p)
+{
+  int **blocks = p;
+  malloc_args *cfg = tests[2];
+  for (int k = 0; k < NUM_ALLOCS; k++)
+    do_benchmark (cfg + k, blocks);
+  return p;
+}
+
+/* Benchmark malloc/free of SIZE-byte blocks in three scenarios: directly in
+   the calling thread, in a separate thread (thread_test), and in the calling
+   thread again once that thread has finished.  The results are written to
+   stdout as a JSON document, each elapsed time divided by NUM_ITERS.  */
+void
+bench (unsigned long size)
+{
+  size_t iters = NUM_ITERS;
+  int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
+  unsigned long res;
+
+  TIMING_INIT (res);
+
+  /* tests has exactly 3 rows (one per scenario); T indexes them.  */
+  for (int t = 0; t < 3; t++)
+    for (int i = 0; i < NUM_ALLOCS; i++)
+      {
+	tests[t][i].n = allocs[i];
+	tests[t][i].size = size;
+	/* Keep the total malloc count near NUM_ITERS.  */
+	tests[t][i].iters = iters / allocs[i];
+
+	/* Do a quick warmup run.  */
+	if (t == 0)
+	  do_benchmark (&tests[0][i], arr);
+      }
+
+  /* Run benchmark single threaded in main_arena.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[0][i], arr);
+
+  /* Run benchmark in a thread_arena.  */
+  pthread_t t;
+  pthread_create (&t, NULL, thread_test, (void*)arr);
+  pthread_join (t, NULL);
+
+  /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[1][i], arr);
+
+  free (arr);
+
+  /* Emit the results: functions -> malloc -> one unnamed result object.  */
+  json_ctx_t json_ctx;
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, "malloc");
+
+  char s[100];
+  double iters2 = iters;
+
+  json_attr_object_begin (&json_ctx, "");
+  json_attr_double (&json_ctx, "malloc_block_size", size);
+
+  struct rusage usage;
+  getrusage (RUSAGE_SELF, &usage);
+  json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss);
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_st_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[0][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_mt_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[1][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "thread_arena__allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[2][i].elapsed / iters2);
+    }
+
+  /* Close the result, "malloc" and "functions" objects in turn.  */
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+}
+/* Print the expected command line for NAME on stderr and exit with 1.  */
+static void usage (const char *name)
+{
+  fprintf (stderr, "%s: <alloc_size>\n", name);
+  exit (1);
+}
+
+/* Run the benchmark for the block size in ARGV[1], defaulting to 16.  */
+int
+main (int argc, char **argv)
+{
+  char *end = NULL;
+  long val = argc == 2 ? strtol (argv[1], &end, 0) : 16;
+
+  /* Reject extra arguments, trailing junk, and non-positive sizes.  */
+  if (argc > 2 || (end != NULL && *end != '\0') || val <= 0)
+    usage (argv[0]);
+
+  bench (val);
+  return 0;
+}

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2019-02-14 16:38   ` Wilco Dijkstra
@ 2019-02-14 20:42     ` DJ Delorie
  0 siblings, 0 replies; 15+ messages in thread
From: DJ Delorie @ 2019-02-14 20:42 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: libc-alpha, nd


Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:

> +/* Benchmark the malloc/free performance of a varying number of blocks of a
> +   given size.  This enables performance tracking of the t-cache and fastbins.
> +   It tests 3 different scenarios: single-threaded using main arena,
> +   multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
> +   false.  */

Excellent!

>> I wonder if this could be done more elegantly, but I'm OK with a simple
>> approach for now.  If we end up adding many more such tests we might
>> need to revisit this part.
>
> The main concern was to get a clean state so that the test of a previous block
> size doesn't affect subsequent results.

Sorry, I meant a more efficient way to structure the Makefile, not the
test itself ;-)

>> How long does this test take to run, on average, compared to other
>> tests?  Do we have to worry about increasing timeouts for slow hosts?
>
> All the tests together finish in a fraction of the time taken by a single
> test of bench-malloc-thread, so if anything we need to reduce the time of
> that one by an order of magnitude (it takes ~5 minutes!).

Ok, thanks.

>> Simple loop, but doesn't test for malloc returning NULL.
>
> Yeah, the benchmark doesn't need to care since the amount we allocate
> is tiny (6.4MBytes).

I still think it's a good idea to check it, else we might end up with
artificially good results from free(NULL).


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2019-02-01 16:27 [PATCH] Add malloc micro benchmark Wilco Dijkstra
  2019-02-08 19:37 ` DJ Delorie
@ 2019-02-28  4:52 ` Carlos O'Donell
  2019-03-04 17:35   ` Wilco Dijkstra
  1 sibling, 1 reply; 15+ messages in thread
From: Carlos O'Donell @ 2019-02-28  4:52 UTC (permalink / raw)
  To: Wilco Dijkstra, 'GNU C Library'; +Cc: nd, Florian Weimer

On 2/1/19 11:27 AM, Wilco Dijkstra wrote:
> Add a malloc micro benchmark to enable accurate testing of the
> various paths in malloc and free.  The benchmark does a varying
> number of allocations of a given block size, then frees them again.
> 
> It tests 3 different scenarios: single-threaded using main arena,
> multi-threaded using thread-arena, main arena with SINGLE_THREAD_P
> false.
> 
> OK for commit?
> 
> ChangeLog:
> 2019-02-01  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* benchtests/Makefile: Add malloc-simple benchmark.
> 	* benchtests/bench-malloc-simple.c: New benchmark.

This broke Fedora Rawhide during CI testing:

BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
BUILDSTDERR:    89 |   unsigned long res;
BUILDSTDERR:       |                 ^~~
BUILDSTDERR: cc1: all warnings being treated as errors

Affects aarch64, armv7hl, and s390x.

I assume we need a "(void) res" like we have in bench-malloc-thread.c?

I'm going to checkin a quick fix to Rawhide and report back if anything
else breaks.

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2019-02-28  4:52 ` Carlos O'Donell
@ 2019-03-04 17:35   ` Wilco Dijkstra
  2019-03-18 17:16     ` Wilco Dijkstra
  0 siblings, 1 reply; 15+ messages in thread
From: Wilco Dijkstra @ 2019-03-04 17:35 UTC (permalink / raw)
  To: Carlos O'Donell, 'GNU C Library'; +Cc: nd, Florian Weimer

Hi Carlos,

> BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
> BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
> BUILDSTDERR:    89 |   unsigned long res;
> BUILDSTDERR:       |                 ^~~
> BUILDSTDERR: cc1: all warnings being treated as errors
>
> Affects aarch64, armv7hl, and s390x.
> 
> I assume we need a "(void) res" like we have in bench-malloc-thread.c?
> 
> I'm going to checkin a quick fix to Rawhide and report back if anything
> else breaks.

Does that enable extra errors somehow? I can't reproduce it.

Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a
patch to just kill it:


Remove TIMING_INIT since it's only used in bench-skeleton.c if there
is no hp-timing support (which will become the default after [1]).

[1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html

ChangeLog:
2019-03-04  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/bench-malloc-simple.c: Remove TIMING_INIT.
	* benchtests/bench-malloc-thread.c: Likewise.
	* benchtests/bench-skeleton.c: Likewise.
	* benchtests/bench-strtod.c: Likewise.
	* benchtests/bench-timing.h: Likewise.

--

diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644
--- a/benchtests/bench-malloc-simple.c
+++ b/benchtests/bench-malloc-simple.c
@@ -86,9 +86,6 @@ bench (unsigned long size)
 {
   size_t iters = NUM_ITERS;
   int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
-  unsigned long res;
-
-  TIMING_INIT (res);
 
   for (int t = 0; t <= 3; t++)
     for (int i = 0; i < NUM_ALLOCS; i++)
diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c
index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644
--- a/benchtests/bench-malloc-thread.c
+++ b/benchtests/bench-malloc-thread.c
@@ -225,7 +225,6 @@ main (int argc, char **argv)
 {
   timing_t cur;
   size_t iters = 0, num_threads = 1;
-  unsigned long res;
   json_ctx_t json_ctx;
   double d_total_s, d_total_i;
   struct sigaction act;
@@ -261,10 +260,6 @@ main (int argc, char **argv)
 
   json_attr_object_begin (&json_ctx, "");
 
-  TIMING_INIT (res);
-
-  (void) res;
-
   memset (&act, 0, sizeof (act));
   act.sa_handler = &alarm_handler;
 
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -48,14 +48,11 @@ main (int argc, char **argv)
 
   memset (&runtime, 0, sizeof (runtime));
 
-  unsigned long iters, res;
+  unsigned long iters = 1000;
 
 #ifdef BENCH_INIT
   BENCH_INIT ();
 #endif
-  TIMING_INIT (res);
-
-  iters = 1000 * res;
 
   json_init (&json_ctx, 2, stdout);
 
diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c
index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644
--- a/benchtests/bench-strtod.c
+++ b/benchtests/bench-strtod.c
@@ -89,9 +89,6 @@ int
 do_bench (void)
 {
   const size_t iters = INNER_LOOP_ITERS;
-  timing_t res __attribute__ ((unused));
-
-  TIMING_INIT (res);
 
   for (size_t i = 0; inputs[i] != NULL; ++i)
     {
diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644
--- a/benchtests/bench-timing.h
+++ b/benchtests/bench-timing.h
@@ -28,8 +28,6 @@ typedef hp_timing_t timing_t;
 
 # define TIMING_TYPE "hp_timing"
 
-# define TIMING_INIT(res) ({ (res) = 1; })
-
 # define TIMING_NOW(var) HP_TIMING_NOW (var)
 # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
 # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
@@ -41,15 +39,6 @@ typedef uint64_t timing_t;
 
 # define TIMING_TYPE "clock_gettime"
 
-/* Measure the resolution of the clock so we can scale the number of
-   benchmark iterations by this value.  */
-# define TIMING_INIT(res) \
-({									      \
-  struct timespec start;						      \
-  clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);			      \
-  (res) = start.tv_nsec;					      \
-})
-
 # define TIMING_NOW(var) \
 ({									      \
   struct timespec tv;							      \

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2019-03-04 17:35   ` Wilco Dijkstra
@ 2019-03-18 17:16     ` Wilco Dijkstra
  2019-04-09  5:25       ` Carlos O'Donell
  0 siblings, 1 reply; 15+ messages in thread
From: Wilco Dijkstra @ 2019-03-18 17:16 UTC (permalink / raw)
  To: Carlos O'Donell, 'GNU C Library'; +Cc: nd, Florian Weimer

ping
  

Hi Carlos,

> BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
> BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
> BUILDSTDERR:    89 |   unsigned long res;
> BUILDSTDERR:       |                 ^~~
> BUILDSTDERR: cc1: all warnings being treated as errors
>
> Affects aarch64, armv7hl, and s390x.
> 
> I assume we need a "(void) res" like we have in bench-malloc-thread.c?
> 
> I'm going to checkin a quick fix to Rawhide and report back if anything
> else breaks.

Does that enable extra errors somehow? I can't reproduce it.

Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a
patch to just kill it:


Remove TIMING_INIT since it's only used in bench-skeleton.c if there
is no hp-timing support (which will become the default after [1]).

[1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html

ChangeLog:
2019-03-04  Wilco Dijkstra  <wdijkstr@arm.com>

        * benchtests/bench-malloc-simple.c: Remove TIMING_INIT.
        * benchtests/bench-malloc-thread.c: Likewise.
        * benchtests/bench-skeleton.c: Likewise.
        * benchtests/bench-strtod.c: Likewise.
        * benchtests/bench-timing.h: Likewise.

--

diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644
--- a/benchtests/bench-malloc-simple.c
+++ b/benchtests/bench-malloc-simple.c
@@ -86,9 +86,6 @@ bench (unsigned long size)
 {
   size_t iters = NUM_ITERS;
   int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
-  unsigned long res;
-
-  TIMING_INIT (res);
 
   for (int t = 0; t <= 3; t++)
     for (int i = 0; i < NUM_ALLOCS; i++)
diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c
index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644
--- a/benchtests/bench-malloc-thread.c
+++ b/benchtests/bench-malloc-thread.c
@@ -225,7 +225,6 @@ main (int argc, char **argv)
 {
   timing_t cur;
   size_t iters = 0, num_threads = 1;
-  unsigned long res;
   json_ctx_t json_ctx;
   double d_total_s, d_total_i;
   struct sigaction act;
@@ -261,10 +260,6 @@ main (int argc, char **argv)
 
   json_attr_object_begin (&json_ctx, "");
 
-  TIMING_INIT (res);
-
-  (void) res;
-
   memset (&act, 0, sizeof (act));
   act.sa_handler = &alarm_handler;
 
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -48,14 +48,11 @@ main (int argc, char **argv)
 
   memset (&runtime, 0, sizeof (runtime));
 
-  unsigned long iters, res;
+  unsigned long iters = 1000;
 
 #ifdef BENCH_INIT
   BENCH_INIT ();
 #endif
-  TIMING_INIT (res);
-
-  iters = 1000 * res;
 
   json_init (&json_ctx, 2, stdout);
 
diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c
index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644
--- a/benchtests/bench-strtod.c
+++ b/benchtests/bench-strtod.c
@@ -89,9 +89,6 @@ int
 do_bench (void)
 {
   const size_t iters = INNER_LOOP_ITERS;
-  timing_t res __attribute__ ((unused));
-
-  TIMING_INIT (res);
 
   for (size_t i = 0; inputs[i] != NULL; ++i)
     {
diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644
--- a/benchtests/bench-timing.h
+++ b/benchtests/bench-timing.h
@@ -28,8 +28,6 @@ typedef hp_timing_t timing_t;
 
 # define TIMING_TYPE "hp_timing"
 
-# define TIMING_INIT(res) ({ (res) = 1; })
-
 # define TIMING_NOW(var) HP_TIMING_NOW (var)
 # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
 # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
@@ -41,15 +39,6 @@ typedef uint64_t timing_t;
 
 # define TIMING_TYPE "clock_gettime"
 
-/* Measure the resolution of the clock so we can scale the number of
-   benchmark iterations by this value.  */
-# define TIMING_INIT(res) \
-({                                                                           \
-  struct timespec start;                                                     \
-  clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);                           \
-  (res) = start.tv_nsec;                                             \
-})
-
 # define TIMING_NOW(var) \
 ({                                                                            \
   struct timespec tv;                                                        \
    

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] Add malloc micro benchmark
  2019-03-18 17:16     ` Wilco Dijkstra
@ 2019-04-09  5:25       ` Carlos O'Donell
  0 siblings, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2019-04-09  5:25 UTC (permalink / raw)
  To: Wilco Dijkstra, Carlos O'Donell, 'GNU C Library'
  Cc: nd, Florian Weimer

On 3/18/19 1:16 PM, Wilco Dijkstra wrote:
> ping
>    
> 
> Hi Carlos,
> 
>> BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
>> BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
>> BUILDSTDERR:    89 |   unsigned long res;
>> BUILDSTDERR:       |                 ^~~
>> BUILDSTDERR: cc1: all warnings being treated as errors
>>
>> Affects aarch64, armv7hl, and s390x.
>>
>> I assume we need a "(void) res" like we have in bench-malloc-thread.c?
>>
>> I'm going to checkin a quick fix to Rawhide and report back if anything
>> else breaks.
> 
> Does that enable extra errors somehow? I can't reproduce it.
> 
> Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a
> patch to just kill it:

LGTM.

Sorry for the delay.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>

> 
> Remove TIMING_INIT since it's only used in bench-skeleton.c if there
> is no hp-timing support (which will become the default after [1]).
> 
> [1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html
> 
> ChangeLog:
> 2019-03-04  Wilco Dijkstra  <wdijkstr@arm.com>
> 
>          * benchtests/bench-malloc-simple.c: Remove TIMING_INIT.
>          * benchtests/bench-malloc-thread.c: Likewise.
>          * benchtests/bench-skeleton.c: Likewise.
>          * benchtests/bench-strtod.c: Likewise.
>          * benchtests/bench-timing.h: Likewise.
> 
> --
> 
> diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
> index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644
> --- a/benchtests/bench-malloc-simple.c
> +++ b/benchtests/bench-malloc-simple.c
> @@ -86,9 +86,6 @@ bench (unsigned long size)
>   {
>     size_t iters = NUM_ITERS;
>     int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
> -  unsigned long res;
> -
> -  TIMING_INIT (res);

OK.

>   
>     for (int t = 0; t <= 3; t++)
>       for (int i = 0; i < NUM_ALLOCS; i++)
> diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c
> index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644
> --- a/benchtests/bench-malloc-thread.c
> +++ b/benchtests/bench-malloc-thread.c
> @@ -225,7 +225,6 @@ main (int argc, char **argv)
>   {
>     timing_t cur;
>     size_t iters = 0, num_threads = 1;
> -  unsigned long res;

OK.

>     json_ctx_t json_ctx;
>     double d_total_s, d_total_i;
>     struct sigaction act;
> @@ -261,10 +260,6 @@ main (int argc, char **argv)
>   
>     json_attr_object_begin (&json_ctx, "");
>   
> -  TIMING_INIT (res);
> -
> -  (void) res;

OK.

> -
>     memset (&act, 0, sizeof (act));
>     act.sa_handler = &alarm_handler;
>   
> diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
> index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644
> --- a/benchtests/bench-skeleton.c
> +++ b/benchtests/bench-skeleton.c
> @@ -48,14 +48,11 @@ main (int argc, char **argv)
>   
>     memset (&runtime, 0, sizeof (runtime));
>   
> -  unsigned long iters, res;
> +  unsigned long iters = 1000;

OK. A fixed number of iterations will do.

>   
>   #ifdef BENCH_INIT
>     BENCH_INIT ();
>   #endif
> -  TIMING_INIT (res);
> -
> -  iters = 1000 * res;

OK.

>   
>     json_init (&json_ctx, 2, stdout);
>   
> diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c
> index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644
> --- a/benchtests/bench-strtod.c
> +++ b/benchtests/bench-strtod.c
> @@ -89,9 +89,6 @@ int
>   do_bench (void)
>   {
>     const size_t iters = INNER_LOOP_ITERS;
> -  timing_t res __attribute__ ((unused));
> -
> -  TIMING_INIT (res);

OK.

>   
>     for (size_t i = 0; inputs[i] != NULL; ++i)
>       {
> diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
> index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644
> --- a/benchtests/bench-timing.h
> +++ b/benchtests/bench-timing.h
> @@ -28,8 +28,6 @@ typedef hp_timing_t timing_t;
>   
>   # define TIMING_TYPE "hp_timing"
>   
> -# define TIMING_INIT(res) ({ (res) = 1; })

OK.

> -
>   # define TIMING_NOW(var) HP_TIMING_NOW (var)
>   # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
>   # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
> @@ -41,15 +39,6 @@ typedef uint64_t timing_t;
>   
>   # define TIMING_TYPE "clock_gettime"
>   
> -/* Measure the resolution of the clock so we can scale the number of
> -   benchmark iterations by this value.  */
> -# define TIMING_INIT(res) \
> -({                                                                           \
> -  struct timespec start;                                                     \
> -  clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);                           \
> -  (res) = start.tv_nsec;                                             \
> -})

OK.

> -
>   # define TIMING_NOW(var) \
>   ({                                                                            \
>     struct timespec tv;                                                        \
>      
> 


-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2019-04-09  5:26 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-01 16:27 [PATCH] Add malloc micro benchmark Wilco Dijkstra
2019-02-08 19:37 ` DJ Delorie
2019-02-14 16:38   ` Wilco Dijkstra
2019-02-14 20:42     ` DJ Delorie
2019-02-28  4:52 ` Carlos O'Donell
2019-03-04 17:35   ` Wilco Dijkstra
2019-03-18 17:16     ` Wilco Dijkstra
2019-04-09  5:25       ` Carlos O'Donell
  -- strict thread matches above, loose matches on Subject: below --
2017-12-01 13:51 Wilco Dijkstra
2017-12-01 16:13 ` Carlos O'Donell
2017-12-18 15:18   ` Wilco Dijkstra
2017-12-18 16:32     ` Carlos O'Donell
2017-12-18 23:02     ` DJ Delorie
2017-12-28 14:09       ` Wilco Dijkstra
2017-12-28 19:01         ` DJ Delorie

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).