unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Add an internal wrapper for clone, clone2 and clone3
@ 2021-02-14 22:45 H.J. Lu via Libc-alpha
  2021-02-15 10:44 ` Florian Weimer via Libc-alpha
  0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-02-14 22:45 UTC (permalink / raw)
  To: libc-alpha

The clone3 system call provides a superset of the functionality of clone
and clone2.  It also provides a number of API improve ments, including
the ability to specify the size of the child's stack area which can be
used by kernel to compute the shadow stack size when allocating the
shadow stack.  Add:

extern int __clone_internal (struct clone_args *cl_args, size_t size,
			     int (*__fn) (void *__arg), void *__arg);

to provide an abstract interface for clone, clone2 and clone3.

1. Simplify stack management for clone by passing stack base and size
to __clone_internal.
2. Consolidate clone vs clone2 differences into a single file.
3. Use only __clone_internal to clone a thread.  If clone3 returns -1
with ENOSYS, __clone_internal will fall back to clone or clone2.
4. Add the x86-64 clone3 wrapper.
5. Enable the public clone3 wrapper in the future after it has been
added to add targets.

Tested with build-many-glibcs.py.  The x86-64 clone3 wrapper and the
i686 clone fallback are tested under 5.10.15 kernel.
---
 include/sched.h                           |   3 +
 nptl/allocatestack.c                      |  59 ++----------
 nptl/createthread.c                       |   3 +-
 nptl/pthread_create.c                     |  17 ++--
 sysdeps/generic/clone-internal.h          |  49 ++++++++++
 sysdeps/unix/sysv/linux/Makefile          |   4 +-
 sysdeps/unix/sysv/linux/Versions          |   1 +
 sysdeps/unix/sysv/linux/clone-internal.c  |  61 +++++++++++++
 sysdeps/unix/sysv/linux/clone-offsets.sym |   9 ++
 sysdeps/unix/sysv/linux/clone3.c          |  31 +++++++
 sysdeps/unix/sysv/linux/createthread.c    |  24 ++---
 sysdeps/unix/sysv/linux/spawni.c          |  26 ++----
 sysdeps/unix/sysv/linux/tst-align-clone.c |  23 +++--
 sysdeps/unix/sysv/linux/tst-clone.c       |  12 +--
 sysdeps/unix/sysv/linux/tst-clone2.c      |  25 +++---
 sysdeps/unix/sysv/linux/tst-clone3.c      |  28 +++---
 sysdeps/unix/sysv/linux/tst-getpid1.c     |  25 +++---
 sysdeps/unix/sysv/linux/x86_64/clone3.S   | 105 ++++++++++++++++++++++
 18 files changed, 347 insertions(+), 158 deletions(-)
 create mode 100644 sysdeps/generic/clone-internal.h
 create mode 100644 sysdeps/unix/sysv/linux/clone-internal.c
 create mode 100644 sysdeps/unix/sysv/linux/clone-offsets.sym
 create mode 100644 sysdeps/unix/sysv/linux/clone3.c
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/clone3.S

diff --git a/include/sched.h b/include/sched.h
index b0bf971c93..28dd99a571 100644
--- a/include/sched.h
+++ b/include/sched.h
@@ -32,5 +32,8 @@ libc_hidden_proto (__clone2)
    and Hurd doesn't have it.  */
 extern int __getcpu (unsigned int *, unsigned int *);
 libc_hidden_proto (__getcpu)
+
+# include <clone-internal.h>
+libc_hidden_proto (__clone_internal)
 #endif
 #endif
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 149b999603..73c7f33a00 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -34,47 +34,6 @@
 #include <stack-aliasing.h>
 
 
-#ifndef NEED_SEPARATE_REGISTER_STACK
-
-/* Most architectures have exactly one stack pointer.  Some have more.  */
-# define STACK_VARIABLES void *stackaddr = NULL
-
-/* How to pass the values to the 'create_thread' function.  */
-# define STACK_VARIABLES_ARGS stackaddr
-
-/* How to declare function which gets there parameters.  */
-# define STACK_VARIABLES_PARMS void *stackaddr
-
-/* How to declare allocate_stack.  */
-# define ALLOCATE_STACK_PARMS void **stack
-
-/* This is how the function is called.  We do it this way to allow
-   other variants of the function to have more parameters.  */
-# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
-
-#else
-
-/* We need two stacks.  The kernel will place them but we have to tell
-   the kernel about the size of the reserved address space.  */
-# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
-
-/* How to pass the values to the 'create_thread' function.  */
-# define STACK_VARIABLES_ARGS stackaddr, stacksize
-
-/* How to declare function which gets there parameters.  */
-# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
-
-/* How to declare allocate_stack.  */
-# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
-
-/* This is how the function is called.  We do it this way to allow
-   other variants of the function to have more parameters.  */
-# define ALLOCATE_STACK(attr, pd) \
-  allocate_stack (attr, pd, &stackaddr, &stacksize)
-
-#endif
-
-
 /* Default alignment of stack.  */
 #ifndef STACK_ALIGN
 # define STACK_ALIGN __alignof__ (long double)
@@ -399,7 +358,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
    PDP must be non-NULL.  */
 static int
 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
-		ALLOCATE_STACK_PARMS)
+		void **stack, size_t *stacksize)
 {
   struct pthread *pd;
   size_t size;
@@ -760,25 +719,17 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
   /* We place the thread descriptor at the end of the stack.  */
   *pdp = pd;
 
-#if _STACK_GROWS_DOWN
   void *stacktop;
 
-# if TLS_TCB_AT_TP
+#if TLS_TCB_AT_TP
   /* The stack begins before the TCB and the static TLS block.  */
   stacktop = ((char *) (pd + 1) - __static_tls_size);
-# elif TLS_DTV_AT_TP
+#elif TLS_DTV_AT_TP
   stacktop = (char *) (pd - 1);
-# endif
+#endif
 
-# ifdef NEED_SEPARATE_REGISTER_STACK
-  *stack = pd->stackblock;
-  *stacksize = stacktop - *stack;
-# else
-  *stack = stacktop;
-# endif
-#else
+  *stacksize = stacktop - pd->stackblock;
   *stack = pd->stackblock;
-#endif
 
   return 0;
 }
diff --git a/nptl/createthread.c b/nptl/createthread.c
index 46943b33fe..2ac83111ec 100644
--- a/nptl/createthread.c
+++ b/nptl/createthread.c
@@ -25,7 +25,8 @@
 
 static int
 create_thread (struct pthread *pd, const struct pthread_attr *attr,
-	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
+	       bool *stopped_start, void *stackaddr, size_t stacksize,
+	       bool *thread_ran)
 {
   /* If the implementation needs to do some tweaks to the thread after
      it has been created at the OS level, it can set STOPPED_START here.  */
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 6c645aff48..2cc3d25697 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -201,8 +201,8 @@ unsigned int __nptl_nthreads = 1;
    be set to true iff the thread actually started up and then got
    canceled before calling user code (*PD->start_routine).  */
 static int create_thread (struct pthread *pd, const struct pthread_attr *attr,
-			  bool *stopped_start, STACK_VARIABLES_PARMS,
-			  bool *thread_ran);
+			  bool *stopped_start, void *stackaddr,
+			  size_t stacksize, bool *thread_ran);
 
 #include <createthread.c>
 
@@ -621,7 +621,8 @@ int
 __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 		      void *(*start_routine) (void *), void *arg)
 {
-  STACK_VARIABLES;
+  void *stackaddr = NULL;
+  size_t stacksize = 0;
 
   /* Avoid a data race in the multi-threaded case.  */
   if (__libc_single_threaded)
@@ -641,7 +642,7 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
     }
 
   struct pthread *pd = NULL;
-  int err = ALLOCATE_STACK (iattr, &pd);
+  int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
   int retval = 0;
 
   if (__glibc_unlikely (err != 0))
@@ -786,8 +787,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 
       /* We always create the thread stopped at startup so we can
 	 notify the debugger.  */
-      retval = create_thread (pd, iattr, &stopped_start,
-			      STACK_VARIABLES_ARGS, &thread_ran);
+      retval = create_thread (pd, iattr, &stopped_start, stackaddr,
+			      stacksize, &thread_ran);
       if (retval == 0)
 	{
 	  /* We retain ownership of PD until (a) (see CONCURRENCY NOTES
@@ -818,8 +819,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 	}
     }
   else
-    retval = create_thread (pd, iattr, &stopped_start,
-			    STACK_VARIABLES_ARGS, &thread_ran);
+    retval = create_thread (pd, iattr, &stopped_start, stackaddr,
+			    stacksize, &thread_ran);
 
   /* Return to the previous signal mask, after creating the new
      thread.  */
diff --git a/sysdeps/generic/clone-internal.h b/sysdeps/generic/clone-internal.h
new file mode 100644
index 0000000000..1c1f997ca5
--- /dev/null
+++ b/sysdeps/generic/clone-internal.h
@@ -0,0 +1,49 @@
+/* The internal wrapper of clone and clone3.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <stddef.h>
+
+struct clone_args
+{
+  uint64_t flags;	 /* Flags bit mask.  */
+  uint64_t pidfd;	 /* Where to store PID file descriptor
+			    (pid_t *).  */
+  uint64_t child_tid;	 /* Where to store child TID, in child's memory
+			    (pid_t *).  */
+  uint64_t parent_tid;	 /* Where to store child TID, in parent's memory
+			    (int *). */
+  uint64_t exit_signal;	 /* Signal to deliver to parent on child
+			    termination */
+  uint64_t stack;	 /* The lowest address of stack.  */
+  uint64_t stack_size;	 /* Size of stack.  */
+  uint64_t tls;		 /* Location of new TLS.  */
+  uint64_t set_tid;	 /* Pointer to a pid_t array
+			    (since Linux 5.5).  */
+  uint64_t set_tid_size; /* Number of elements in set_tid
+			    (since Linux 5.5). */
+  uint64_t cgroup;	 /* File descriptor for target cgroup
+			    of child (since Linux 5.7).  */
+};
+
+extern int __clone3 (struct clone_args *cl_args, size_t size,
+		     int (*__fn) (void *__arg), void *__arg)
+  __attribute__ ((visibility ("hidden")));
+
+/* The internal wrapper of clone and clone3.  */
+extern __typeof (__clone3) __clone_internal;
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 488114a41e..b747769bb9 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -54,6 +54,8 @@ CFLAGS-malloc.c += -DMORECORE_CLEARS=2
 endif
 
 ifeq ($(subdir),misc)
+gen-as-const-headers += clone-offsets.sym
+
 sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   setfsuid setfsgid epoll_pwait signalfd \
 		   eventfd eventfd_read eventfd_write prlimit \
@@ -64,7 +66,7 @@ sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   time64-support pselect32 \
 		   xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
 		   fxstatat fxstatat64 \
-		   xmknod xmknodat
+		   xmknod xmknodat clone3 clone-internal
 
 CFLAGS-gethostid.c = -fexceptions
 CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
index c35f783e2a..9bb13d593f 100644
--- a/sysdeps/unix/sysv/linux/Versions
+++ b/sysdeps/unix/sysv/linux/Versions
@@ -179,5 +179,6 @@ libc {
     __sigtimedwait;
     # functions used by nscd
     __netlink_assert_response;
+    __clone_internal;
   }
 }
diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
new file mode 100644
index 0000000000..7d755cd97f
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-internal.c
@@ -0,0 +1,61 @@
+/* The internal wrapper of clone and clone3.  Linux version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+#include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
+
+int
+__clone_internal (struct clone_args *cl_args, size_t size,
+		  int (*fn)(void *arg), void *arg)
+{
+  /* Try clone3 first.  */
+  int ret = __clone3 (cl_args, size, fn, arg);
+  if (ret == -1 && errno == ENOSYS)
+    {
+      /* NB: Must clear errno since errno may be checked against non-zero
+         return value.  */
+      __set_errno (0);
+      /* Map clone3 arguments to clone arguments.  */
+      int flags = cl_args->flags | cl_args->exit_signal;
+      void *stack = (void *) (uintptr_t) cl_args->stack;
+
+#ifdef __ia64__
+      ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
+		      flags, arg,
+		      (void *) (uintptr_t) cl_args->parent_tid,
+		      (void *) (uintptr_t) cl_args->tls,
+		      (void *) (uintptr_t) cl_args->child_tid);
+#else
+# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
+#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# endif
+
+# if _STACK_GROWS_DOWN
+      stack += cl_args->stack_size;
+# endif
+      ret = __clone (fn, stack, flags, arg,
+		     (void *) (uintptr_t) cl_args->parent_tid,
+		     (void *) (uintptr_t) cl_args->tls,
+		     (void *) (uintptr_t) cl_args->child_tid);
+#endif
+    }
+  return ret;
+}
+
+libc_hidden_def (__clone_internal)
diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
new file mode 100644
index 0000000000..38d447bcf3
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
@@ -0,0 +1,9 @@
+#include <stddef.h>
+#include <sched.h>
+
+--
+
+#define clone_args_offset(member) offsetof (struct clone_args, member)
+
+CLONE_ARGS_STACK_OFFSET		clone_args_offset(stack)
+CLONE_ARGS_STACK_SIZE_OFFSET	clone_args_offset(stack_size)
diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
new file mode 100644
index 0000000000..c7d88e7af7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone3.c
@@ -0,0 +1,31 @@
+/* The clone3 syscall wrapper.  Linux version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+
+int
+__clone3 (struct clone_args *cl_args, size_t size,
+	  int (*fn)(void *arg), void *arg)
+{
+  if (cl_args == NULL || fn == NULL)
+    __set_errno (EINVAL);
+  else
+    __set_errno (ENOSYS);
+  return -1;
+}
diff --git a/sysdeps/unix/sysv/linux/createthread.c b/sysdeps/unix/sysv/linux/createthread.c
index bc3409b326..599c172de8 100644
--- a/sysdeps/unix/sysv/linux/createthread.c
+++ b/sysdeps/unix/sysv/linux/createthread.c
@@ -28,12 +28,6 @@
 
 #include <arch-fork.h>
 
-#ifdef __NR_clone2
-# define ARCH_CLONE __clone2
-#else
-# define ARCH_CLONE __clone
-#endif
-
 /* See the comments in pthread_create.c for the requirements for these
    two macros and the create_thread function.  */
 
@@ -47,7 +41,8 @@ static int start_thread (void *arg) __attribute__ ((noreturn));
 
 static int
 create_thread (struct pthread *pd, const struct pthread_attr *attr,
-	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
+	       bool *stopped_start, void *stackaddr, size_t stacksize,
+	       bool *thread_ran)
 {
   /* Determine whether the newly created threads has to be started
      stopped since we have to set the scheduling parameters or set the
@@ -100,9 +95,18 @@ create_thread (struct pthread *pd, const struct pthread_attr *attr,
 
   TLS_DEFINE_INIT_TP (tp, pd);
 
-  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
-				    clone_flags, pd, &pd->tid, tp, &pd->tid)
-			== -1))
+  struct clone_args args =
+    {
+      .flags = clone_flags,
+      .pidfd = (uintptr_t) &pd->tid,
+      .parent_tid = (uintptr_t) &pd->tid,
+      .child_tid = (uintptr_t) &pd->tid,
+      .stack = (uintptr_t) stackaddr,
+      .stack_size = stacksize,
+      .tls = (uintptr_t) tp,
+    };
+  int ret = __clone_internal (&args, sizeof (args), &start_thread, pd);
+  if (__glibc_unlikely (ret == -1))
     return errno;
 
   /* It's started now, so if we fail below, we'll have to cancel it
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index b53b81b8fc..c63e2b65f1 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -59,21 +59,6 @@
    normal program exit with the exit code 127.  */
 #define SPAWN_ERROR	127
 
-#ifdef __ia64__
-# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
-  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
-#else
-# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
-  __clone (__fn, __stack, __flags, __args)
-#endif
-
-/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
-#if _STACK_GROWS_UP || defined (__ia64__)
-# define STACK(__stack, __stack_size) (__stack)
-#elif _STACK_GROWS_DOWN
-# define STACK(__stack, __stack_size) (__stack + __stack_size)
-#endif
-
 
 struct posix_spawn_args
 {
@@ -379,8 +364,15 @@ __spawnix (pid_t * pid, const char *file,
      need for CLONE_SETTLS.  Although parent and child share the same TLS
      namespace, there will be no concurrent access for TLS variables (errno
      for instance).  */
-  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
-		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
+  struct clone_args clone_args =
+    {
+      .flags = CLONE_VM | CLONE_VFORK,
+      .exit_signal = SIGCHLD,
+      .stack = (uintptr_t) stack,
+      .stack_size = stack_size,
+    };
+  new_pid = __clone_internal (&clone_args, sizeof (clone_args),
+			      __spawni_child, &args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
index 6ace61bac3..5f00b8ee4a 100644
--- a/sysdeps/unix/sysv/linux/tst-align-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
@@ -23,6 +23,7 @@
 #include <sys/wait.h>
 #include <unistd.h>
 #include <tst-stack-align.h>
+#include <clone-internal.h>
 #include <stackinfo.h>
 
 static int
@@ -49,21 +50,17 @@ do_test (void)
     ok = false;
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024];
-  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-# if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), 0, 0);
-# elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, 0, 0);
-# else
-#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-# endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
   if (p == -1)
     {
       printf("clone failed: %m\n");
diff --git a/sysdeps/unix/sysv/linux/tst-clone.c b/sysdeps/unix/sysv/linux/tst-clone.c
index e6ae0106ef..2865182f05 100644
--- a/sysdeps/unix/sysv/linux/tst-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-clone.c
@@ -22,11 +22,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <sched.h>
-
-#ifdef __ia64__
-extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		     size_t __child_stack_size, int __flags, void *__arg, ...);
-#endif
+#include <clone-internal.h>
 
 int child_fn(void *arg)
 {
@@ -39,11 +35,7 @@ do_test (void)
 {
   int result;
 
-#ifdef __ia64__
-  result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
-#else
-  result = clone (child_fn, NULL, 0, NULL);
-#endif
+  result = __clone_internal (NULL, 0, child_fn, NULL);
 
   if (errno != EINVAL || result != -1)
     {
diff --git a/sysdeps/unix/sysv/linux/tst-clone2.c b/sysdeps/unix/sysv/linux/tst-clone2.c
index ce36c70c0d..fda77ac48b 100644
--- a/sysdeps/unix/sysv/linux/tst-clone2.c
+++ b/sysdeps/unix/sysv/linux/tst-clone2.c
@@ -26,9 +26,11 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <errno.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <sys/syscall.h>
+#include <clone-internal.h>
 
 #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
 
@@ -70,23 +72,18 @@ do_test (void)
   if (pipe2 (pipefd, O_CLOEXEC))
     FAIL_EXIT1 ("pipe failed: %m");
 
-  int clone_flags = 0;
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), clone_flags, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-#if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), clone_flags, 0);
-#elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, clone_flags, 0);
-#else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-#endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
 
   close (pipefd[1]);
 
diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
index 1a1bfd4586..3e505807b6 100644
--- a/sysdeps/unix/sysv/linux/tst-clone3.c
+++ b/sysdeps/unix/sysv/linux/tst-clone3.c
@@ -20,10 +20,12 @@
 #include <sched.h>
 #include <signal.h>
 #include <unistd.h>
+#include <errno.h>
 #include <sys/syscall.h>
 #include <sys/wait.h>
 #include <sys/types.h>
 #include <linux/futex.h>
+#include <clone-internal.h>
 
 #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
 #include <support/check.h>
@@ -78,23 +80,15 @@ do_test (void)
   pid_t ctid = CTID_INIT_VAL;
   pid_t tid;
 
-#ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-		  /* tls */ NULL, &ctid);
-#else
-#if _STACK_GROWS_DOWN
-  tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-	       /* tls */ NULL, &ctid);
-#elif _STACK_GROWS_UP
-  tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
-	       &ctid);
-#else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-#endif
-#endif
+  struct clone_args clone_args =
+    {
+      .flags = clone_flags & ~CSIGNAL,
+      .exit_signal = clone_flags & CSIGNAL,
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+      .child_tid = (uintptr_t) &ctid,
+    };
+  tid = __clone_internal (&clone_args, sizeof (clone_args), f, NULL);
   if (tid == -1)
     FAIL_EXIT1 ("clone failed: %m");
 
diff --git a/sysdeps/unix/sysv/linux/tst-getpid1.c b/sysdeps/unix/sysv/linux/tst-getpid1.c
index 253ebf2e15..b78f56df4f 100644
--- a/sysdeps/unix/sysv/linux/tst-getpid1.c
+++ b/sysdeps/unix/sysv/linux/tst-getpid1.c
@@ -5,6 +5,7 @@
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <clone-internal.h>
 #include <stackinfo.h>
 
 #ifndef TEST_CLONE_FLAGS
@@ -42,21 +43,19 @@ do_test (void)
     }
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-# if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
-# elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, TEST_CLONE_FLAGS, 0);
-# else
-#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-# endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .flags = TEST_CLONE_FLAGS & ~CSIGNAL,
+      .exit_signal = TEST_CLONE_FLAGS & CSIGNAL,
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
   if (p == -1)
     {
       printf("clone failed: %m\n");
diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
new file mode 100644
index 0000000000..dfceb93631
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
@@ -0,0 +1,105 @@
+/* The clone3 syscall wrapper.  Linux/x86-64 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* clone3() is even more special than fork() as it mucks with stacks
+   and invokes a function in the right context after its all over.  */
+
+#include <sysdep.h>
+#include <clone-offsets.h>
+
+/* The userland implementation is:
+   int clone3 (struct clone_args *cl_args, size_t size,
+	       int (*fn)(void *arg), void *arg);
+   the kernel entry is:
+   int clone3 (struct clone_args *cl_args, size_t size);
+
+   The parameters are passed in registers from userland:
+   rdi: cl_args
+   rsi: size
+   rdx: fn
+   rcx: arg
+
+   The kernel expects:
+   rax: system call number
+   rdi: cl_args
+   rsi: size  */
+
+        .text
+ENTRY (__clone3)
+	/* Sanity check arguments.  */
+	movq	$-EINVAL, %rax
+	testq	%rdi,%rdi		/* No NULL cl_args pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+	testq	%rdx, %rdx		/* No NULL function pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+
+	/* Load the new stack bottom into R8.  */
+	movq	CLONE_ARGS_STACK_OFFSET(%rdi), %r8
+
+	/* Load the new stack size into R9.  */
+	movq	CLONE_ARGS_STACK_SIZE_OFFSET(%rdi), %r9
+
+	/* Make room for 2 8-byte slots on the new stack.  */
+	subq	$16, %r8
+
+	/* Insert the argument onto the new stack.  */
+	movq	%rcx, 8(%r8, %r9)
+
+	/* Save the function pointer.  It will be popped off in the
+	   child.  */
+	movq	%rdx, 0(%r8, %r9)
+
+	/* Update the stack field passed to clone3.  */
+	movq	%r8, CLONE_ARGS_STACK_OFFSET(%rdi)
+
+	/* Do the system call.  */
+	movl	$SYS_ify(clone3), %eax
+
+	/* End FDE now, because in the child the unwind info will be
+	   wrong.  */
+	cfi_endproc;
+	syscall
+
+	test	%RAX_LP, %RAX_LP
+	jl	SYSCALL_ERROR_LABEL
+	jz	L(thread_start)
+
+	ret
+
+L(thread_start):
+	cfi_startproc;
+	/* Clearing frame pointer is insufficient, use CFI.  */
+	cfi_undefined (rip);
+	/* Clear the frame pointer.  The ABI suggests this be done, to mark
+	   the outermost frame obviously.  */
+	xorl	%ebp, %ebp
+
+	/* Set up arguments for the function call.  */
+	popq	%rax		/* Function to call.  */
+	popq	%rdi		/* Argument.  */
+	call	*%rax
+	/* Call exit with return value from function call. */
+	movq	%rax, %rdi
+	movl	$SYS_ify(exit), %eax
+	syscall
+	cfi_endproc;
+
+	cfi_startproc;
+PSEUDO_END (__clone3)
+
+libc_hidden_def (__clone3)
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] Add an internal wrapper for clone, clone2 and clone3
  2021-02-14 22:45 [PATCH] Add an internal wrapper for clone, clone2 and clone3 H.J. Lu via Libc-alpha
@ 2021-02-15 10:44 ` Florian Weimer via Libc-alpha
  2021-02-15 14:17   ` H.J. Lu via Libc-alpha
  2021-02-15 14:39   ` Adhemerval Zanella via Libc-alpha
  0 siblings, 2 replies; 14+ messages in thread
From: Florian Weimer via Libc-alpha @ 2021-02-15 10:44 UTC (permalink / raw)
  To: H.J. Lu via Libc-alpha

* H. J. Lu via Libc-alpha:

> The clone3 system call provides a superset of the functionality of clone
> and clone2.  It also provides a number of API improve ments, including
> the ability to specify the size of the child's stack area which can be
> used by kernel to compute the shadow stack size when allocating the
> shadow stack.  Add:
>
> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> 			     int (*__fn) (void *__arg), void *__arg);
>
> to provide an abstract interface for clone, clone2 and clone3.
>
> 1. Simplify stack management for clone by passing stack base and size
> to __clone_internal.
> 2. Consolidate clone vs clone2 differences into a single file.
> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> with ENOSYS, __clone_internal will fall back to clone or clone2.
> 4. Add the x86-64 clone3 wrapper.
> 5. Enable the public clone3 wrapper in the future after it has been
> added to add targets.

What do you think about providing a clone wrapper which reuses the
caller's stack?  That would be useful for a (v)fork-style clone.  This
variant could also be exported because the callback inherits a
semi-valid TCB in this case.

Thanks,
Florian
-- 
Red Hat GmbH, https://de.redhat.com/ , Registered seat: Grasbrunn,
Commercial register: Amtsgericht Muenchen, HRB 153243,
Managing Directors: Charles Cachera, Brian Klemm, Laurie Krebs, Michael O'Neill


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Add an internal wrapper for clone, clone2 and clone3
  2021-02-15 10:44 ` Florian Weimer via Libc-alpha
@ 2021-02-15 14:17   ` H.J. Lu via Libc-alpha
  2021-02-15 14:27     ` Florian Weimer via Libc-alpha
  2021-02-15 14:39   ` Adhemerval Zanella via Libc-alpha
  1 sibling, 1 reply; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-02-15 14:17 UTC (permalink / raw)
  To: Florian Weimer; +Cc: H.J. Lu via Libc-alpha

On Mon, Feb 15, 2021 at 2:43 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu via Libc-alpha:
>
> > The clone3 system call provides a superset of the functionality of clone
> > and clone2.  It also provides a number of API improve ments, including
> > the ability to specify the size of the child's stack area which can be
> > used by kernel to compute the shadow stack size when allocating the
> > shadow stack.  Add:
> >
> > extern int __clone_internal (struct clone_args *cl_args, size_t size,
> >                            int (*__fn) (void *__arg), void *__arg);
> >
> > to provide an abstract interface for clone, clone2 and clone3.
> >
> > 1. Simplify stack management for clone by passing stack base and size
> > to __clone_internal.
> > 2. Consolidate clone vs clone2 differences into a single file.
> > 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> > with ENOSYS, __clone_internal will fall back to clone or clone2.
> > 4. Add the x86-64 clone3 wrapper.
> > 5. Enable the public clone3 wrapper in the future after it has been
> > added to add targets.
>
> What do you think about providing a clone wrapper which reuses the
> caller's stack?  That would be useful for a (v)fork-style clone.  This
> variant could also be exported because the callback inherits a
> semi-valid TCB in this case.

Sure.  But all new clone wrappers should support clone3.


-- 
H.J.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Add an internal wrapper for clone, clone2 and clone3
  2021-02-15 14:17   ` H.J. Lu via Libc-alpha
@ 2021-02-15 14:27     ` Florian Weimer via Libc-alpha
  2021-02-15 14:44       ` H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 14+ messages in thread
From: Florian Weimer via Libc-alpha @ 2021-02-15 14:27 UTC (permalink / raw)
  To: H.J. Lu; +Cc: H.J. Lu via Libc-alpha

* H. J. Lu:

> On Mon, Feb 15, 2021 at 2:43 AM Florian Weimer <fweimer@redhat.com> wrote:
>>
>> * H. J. Lu via Libc-alpha:
>>
>> > The clone3 system call provides a superset of the functionality of clone
>> > and clone2.  It also provides a number of API improve ments, including
>> > the ability to specify the size of the child's stack area which can be
>> > used by kernel to compute the shadow stack size when allocating the
>> > shadow stack.  Add:
>> >
>> > extern int __clone_internal (struct clone_args *cl_args, size_t size,
>> >                            int (*__fn) (void *__arg), void *__arg);
>> >
>> > to provide an abstract interface for clone, clone2 and clone3.
>> >
>> > 1. Simplify stack management for clone by passing stack base and size
>> > to __clone_internal.
>> > 2. Consolidate clone vs clone2 differences into a single file.
>> > 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
>> > with ENOSYS, __clone_internal will fall back to clone or clone2.
>> > 4. Add the x86-64 clone3 wrapper.
>> > 5. Enable the public clone3 wrapper in the future after it has been
>> > added to add targets.
>>
>> What do you think about providing a clone wrapper which reuses the
>> caller's stack?  That would be useful for a (v)fork-style clone.  This
>> variant could also be exported because the callback inherits a
>> semi-valid TCB in this case.
>
> Sure.  But all new clone wrappers should support clone3.

Through a single interface?  I'm not entirely convinced that this is
possible/desirable, given the differences in behavior.  There is also
the matter of seccomp filters, which have trouble reading userspace
memory.

Thanks,
Florian
-- 
Red Hat GmbH, https://de.redhat.com/ , Registered seat: Grasbrunn,
Commercial register: Amtsgericht Muenchen, HRB 153243,
Managing Directors: Charles Cachera, Brian Klemm, Laurie Krebs, Michael O'Neill


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Add an internal wrapper for clone, clone2 and clone3
  2021-02-15 10:44 ` Florian Weimer via Libc-alpha
  2021-02-15 14:17   ` H.J. Lu via Libc-alpha
@ 2021-02-15 14:39   ` Adhemerval Zanella via Libc-alpha
  2021-03-04 18:40     ` [PATCH v2] " H.J. Lu via Libc-alpha
  1 sibling, 1 reply; 14+ messages in thread
From: Adhemerval Zanella via Libc-alpha @ 2021-02-15 14:39 UTC (permalink / raw)
  To: Florian Weimer, H.J. Lu via Libc-alpha



On 15/02/2021 07:44, Florian Weimer via Libc-alpha wrote:
> * H. J. Lu via Libc-alpha:
> 
>> The clone3 system call provides a superset of the functionality of clone
>> and clone2.  It also provides a number of API improve ments, including
>> the ability to specify the size of the child's stack area which can be
>> used by kernel to compute the shadow stack size when allocating the
>> shadow stack.  Add:
>>
>> extern int __clone_internal (struct clone_args *cl_args, size_t size,
>> 			     int (*__fn) (void *__arg), void *__arg);
>>
>> to provide an abstract interface for clone, clone2 and clone3.
>>
>> 1. Simplify stack management for clone by passing stack base and size
>> to __clone_internal.
>> 2. Consolidate clone vs clone2 differences into a single file.
>> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
>> with ENOSYS, __clone_internal will fall back to clone or clone2.
>> 4. Add the x86-64 clone3 wrapper.
>> 5. Enable the public clone3 wrapper in the future after it has been
>> added to add targets.
> 
> What do you think about providing a clone wrapper which reuses the
> caller's stack?  That would be useful for a (v)fork-style clone.  This
> variant could also be exported because the callback inherits a
> semi-valid TCB in this case.
I have this one as a TODO to improve the posix_spawn. 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Add an internal wrapper for clone, clone2 and clone3
  2021-02-15 14:27     ` Florian Weimer via Libc-alpha
@ 2021-02-15 14:44       ` H.J. Lu via Libc-alpha
  0 siblings, 0 replies; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-02-15 14:44 UTC (permalink / raw)
  To: Florian Weimer; +Cc: H.J. Lu via Libc-alpha

On Mon, Feb 15, 2021 at 6:27 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu:
>
> > On Mon, Feb 15, 2021 at 2:43 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * H. J. Lu via Libc-alpha:
> >>
> >> > The clone3 system call provides a superset of the functionality of clone
> >> > and clone2.  It also provides a number of API improve ments, including
> >> > the ability to specify the size of the child's stack area which can be
> >> > used by kernel to compute the shadow stack size when allocating the
> >> > shadow stack.  Add:
> >> >
> >> > extern int __clone_internal (struct clone_args *cl_args, size_t size,
> >> >                            int (*__fn) (void *__arg), void *__arg);
> >> >
> >> > to provide an abstract interface for clone, clone2 and clone3.
> >> >
> >> > 1. Simplify stack management for clone by passing stack base and size
> >> > to __clone_internal.
> >> > 2. Consolidate clone vs clone2 differences into a single file.
> >> > 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> >> > with ENOSYS, __clone_internal will fall back to clone or clone2.
> >> > 4. Add the x86-64 clone3 wrapper.
> >> > 5. Enable the public clone3 wrapper in the future after it has been
> >> > added to add targets.
> >>
> >> What do you think about providing a clone wrapper which reuses the
> >> caller's stack?  That would be useful for a (v)fork-style clone.  This
> >> variant could also be exported because the callback inherits a
> >> semi-valid TCB in this case.
> >
> > Sure.  But all new clone wrappers should support clone3.
>
> Through a single interface?  I'm not entirely convinced that this is

Not a single public interface.  If we want to do it, we should start with
the clone3 wrapper.

> possible/desirable, given the differences in behavior.  There is also
> the matter of seccomp filters, which have trouble reading userspace
> memory.
>

-- 
H.J.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] Add an internal wrapper for clone, clone2 and clone3
  2021-02-15 14:39   ` Adhemerval Zanella via Libc-alpha
@ 2021-03-04 18:40     ` H.J. Lu via Libc-alpha
  2021-03-04 18:50       ` H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-03-04 18:40 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Florian Weimer, H.J. Lu via Libc-alpha

On Mon, Feb 15, 2021 at 6:56 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 15/02/2021 07:44, Florian Weimer via Libc-alpha wrote:
> > * H. J. Lu via Libc-alpha:
> >
> >> The clone3 system call provides a superset of the functionality of clone
> >> and clone2.  It also provides a number of API improve ments, including
> >> the ability to specify the size of the child's stack area which can be
> >> used by kernel to compute the shadow stack size when allocating the
> >> shadow stack.  Add:
> >>
> >> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> >>                           int (*__fn) (void *__arg), void *__arg);
> >>
> >> to provide an abstract interface for clone, clone2 and clone3.
> >>
> >> 1. Simplify stack management for clone by passing stack base and size
> >> to __clone_internal.
> >> 2. Consolidate clone vs clone2 differences into a single file.
> >> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> >> with ENOSYS, __clone_internal will fall back to clone or clone2.
> >> 4. Add the x86-64 clone3 wrapper.
> >> 5. Enable the public clone3 wrapper in the future after it has been
> >> added to add targets.
> >
> > What do you think about providing a clone wrapper which reuses the
> > caller's stack?  That would be useful for a (v)fork-style clone.  This
> > variant could also be exported because the callback inherits a
> > semi-valid TCB in this case.
> I have this one as a TODO to improve the posix_spawn.

Here is the v2 patch:

1. Update the stack size field, instead of the stack field, passed to
clone3.

OK for master?

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Add an internal wrapper for clone, clone2 and clone3
  2021-03-04 18:40     ` [PATCH v2] " H.J. Lu via Libc-alpha
@ 2021-03-04 18:50       ` H.J. Lu via Libc-alpha
  2021-03-05  8:32         ` Christian Brauner
  0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-03-04 18:50 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Florian Weimer, H.J. Lu via Libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1832 bytes --]

On Thu, Mar 4, 2021 at 10:40 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Feb 15, 2021 at 6:56 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> >
> >
> > On 15/02/2021 07:44, Florian Weimer via Libc-alpha wrote:
> > > * H. J. Lu via Libc-alpha:
> > >
> > >> The clone3 system call provides a superset of the functionality of clone
> > >> and clone2.  It also provides a number of API improve ments, including
> > >> the ability to specify the size of the child's stack area which can be
> > >> used by kernel to compute the shadow stack size when allocating the
> > >> shadow stack.  Add:
> > >>
> > >> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> > >>                           int (*__fn) (void *__arg), void *__arg);
> > >>
> > >> to provide an abstract interface for clone, clone2 and clone3.
> > >>
> > >> 1. Simplify stack management for clone by passing stack base and size
> > >> to __clone_internal.
> > >> 2. Consolidate clone vs clone2 differences into a single file.
> > >> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> > >> with ENOSYS, __clone_internal will fall back to clone or clone2.
> > >> 4. Add the x86-64 clone3 wrapper.
> > >> 5. Enable the public clone3 wrapper in the future after it has been
> > >> added to add targets.
> > >
> > > What do you think about providing a clone wrapper which reuses the
> > > caller's stack?  That would be useful for a (v)fork-style clone.  This
> > > variant could also be exported because the callback inherits a
> > > semi-valid TCB in this case.
> > I have this one as a TODO to improve the posix_spawn.
>
> Here is the v2 patch:
>
> 1. Update the stack size field, instead of the stack field, passed to
> clone3.
>
> OK for master?

Really add the patch this time :-).

-- 
H.J.

[-- Attachment #2: v2-0001-Add-an-internal-wrapper-for-clone-clone2-and-clon.patch --]
[-- Type: text/x-patch, Size: 29563 bytes --]

From 2d48b40766bc870d44ee31db2e27843bb2376330 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 13 Feb 2021 11:47:46 -0800
Subject: [PATCH v2] Add an internal wrapper for clone, clone2 and clone3

The clone3 system call provides a superset of the functionality of clone
and clone2.  It also provides a number of API improve ments, including
the ability to specify the size of the child's stack area which can be
used by kernel to compute the shadow stack size when allocating the
shadow stack.  Add:

extern int __clone_internal (struct clone_args *cl_args, size_t size,
			     int (*__fn) (void *__arg), void *__arg);

to provide an abstract interface for clone, clone2 and clone3.

1. Simplify stack management for clone by passing stack base and size
to __clone_internal.
2. Consolidate clone vs clone2 differences into a single file.
3. Use only __clone_internal to clone a thread.  If clone3 returns -1
with ENOSYS, __clone_internal will fall back to clone or clone2.
4. Add the x86-64 clone3 wrapper.
5. Enable the public clone3 wrapper in the future after it has been
added to all targets.

Tested with build-many-glibcs.py.  The x86-64 clone3 wrapper and the
i686 clone fallback are tested under 5.10.15 kernel.

Changes in v2:

1. Update the stack size field, instead of the stack field, passed to
clone3.
---
 include/sched.h                           |   3 +
 nptl/allocatestack.c                      |  59 ++----------
 nptl/createthread.c                       |   3 +-
 nptl/pthread_create.c                     |  17 ++--
 sysdeps/generic/clone-internal.h          |  49 ++++++++++
 sysdeps/unix/sysv/linux/Makefile          |   4 +-
 sysdeps/unix/sysv/linux/Versions          |   1 +
 sysdeps/unix/sysv/linux/clone-internal.c  |  61 +++++++++++++
 sysdeps/unix/sysv/linux/clone-offsets.sym |   9 ++
 sysdeps/unix/sysv/linux/clone3.c          |  31 +++++++
 sysdeps/unix/sysv/linux/createthread.c    |  24 ++---
 sysdeps/unix/sysv/linux/spawni.c          |  26 ++----
 sysdeps/unix/sysv/linux/tst-align-clone.c |  23 +++--
 sysdeps/unix/sysv/linux/tst-clone.c       |  12 +--
 sysdeps/unix/sysv/linux/tst-clone2.c      |  25 +++---
 sysdeps/unix/sysv/linux/tst-clone3.c      |  28 +++---
 sysdeps/unix/sysv/linux/tst-getpid1.c     |  25 +++---
 sysdeps/unix/sysv/linux/x86_64/clone3.S   | 105 ++++++++++++++++++++++
 18 files changed, 347 insertions(+), 158 deletions(-)
 create mode 100644 sysdeps/generic/clone-internal.h
 create mode 100644 sysdeps/unix/sysv/linux/clone-internal.c
 create mode 100644 sysdeps/unix/sysv/linux/clone-offsets.sym
 create mode 100644 sysdeps/unix/sysv/linux/clone3.c
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/clone3.S

diff --git a/include/sched.h b/include/sched.h
index b0bf971c93..28dd99a571 100644
--- a/include/sched.h
+++ b/include/sched.h
@@ -32,5 +32,8 @@ libc_hidden_proto (__clone2)
    and Hurd doesn't have it.  */
 extern int __getcpu (unsigned int *, unsigned int *);
 libc_hidden_proto (__getcpu)
+
+# include <clone-internal.h>
+libc_hidden_proto (__clone_internal)
 #endif
 #endif
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 149b999603..73c7f33a00 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -34,47 +34,6 @@
 #include <stack-aliasing.h>
 
 
-#ifndef NEED_SEPARATE_REGISTER_STACK
-
-/* Most architectures have exactly one stack pointer.  Some have more.  */
-# define STACK_VARIABLES void *stackaddr = NULL
-
-/* How to pass the values to the 'create_thread' function.  */
-# define STACK_VARIABLES_ARGS stackaddr
-
-/* How to declare function which gets there parameters.  */
-# define STACK_VARIABLES_PARMS void *stackaddr
-
-/* How to declare allocate_stack.  */
-# define ALLOCATE_STACK_PARMS void **stack
-
-/* This is how the function is called.  We do it this way to allow
-   other variants of the function to have more parameters.  */
-# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
-
-#else
-
-/* We need two stacks.  The kernel will place them but we have to tell
-   the kernel about the size of the reserved address space.  */
-# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
-
-/* How to pass the values to the 'create_thread' function.  */
-# define STACK_VARIABLES_ARGS stackaddr, stacksize
-
-/* How to declare function which gets there parameters.  */
-# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
-
-/* How to declare allocate_stack.  */
-# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
-
-/* This is how the function is called.  We do it this way to allow
-   other variants of the function to have more parameters.  */
-# define ALLOCATE_STACK(attr, pd) \
-  allocate_stack (attr, pd, &stackaddr, &stacksize)
-
-#endif
-
-
 /* Default alignment of stack.  */
 #ifndef STACK_ALIGN
 # define STACK_ALIGN __alignof__ (long double)
@@ -399,7 +358,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
    PDP must be non-NULL.  */
 static int
 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
-		ALLOCATE_STACK_PARMS)
+		void **stack, size_t *stacksize)
 {
   struct pthread *pd;
   size_t size;
@@ -760,25 +719,17 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
   /* We place the thread descriptor at the end of the stack.  */
   *pdp = pd;
 
-#if _STACK_GROWS_DOWN
   void *stacktop;
 
-# if TLS_TCB_AT_TP
+#if TLS_TCB_AT_TP
   /* The stack begins before the TCB and the static TLS block.  */
   stacktop = ((char *) (pd + 1) - __static_tls_size);
-# elif TLS_DTV_AT_TP
+#elif TLS_DTV_AT_TP
   stacktop = (char *) (pd - 1);
-# endif
+#endif
 
-# ifdef NEED_SEPARATE_REGISTER_STACK
-  *stack = pd->stackblock;
-  *stacksize = stacktop - *stack;
-# else
-  *stack = stacktop;
-# endif
-#else
+  *stacksize = stacktop - pd->stackblock;
   *stack = pd->stackblock;
-#endif
 
   return 0;
 }
diff --git a/nptl/createthread.c b/nptl/createthread.c
index 46943b33fe..2ac83111ec 100644
--- a/nptl/createthread.c
+++ b/nptl/createthread.c
@@ -25,7 +25,8 @@
 
 static int
 create_thread (struct pthread *pd, const struct pthread_attr *attr,
-	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
+	       bool *stopped_start, void *stackaddr, size_t stacksize,
+	       bool *thread_ran)
 {
   /* If the implementation needs to do some tweaks to the thread after
      it has been created at the OS level, it can set STOPPED_START here.  */
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 6c645aff48..2cc3d25697 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -201,8 +201,8 @@ unsigned int __nptl_nthreads = 1;
    be set to true iff the thread actually started up and then got
    canceled before calling user code (*PD->start_routine).  */
 static int create_thread (struct pthread *pd, const struct pthread_attr *attr,
-			  bool *stopped_start, STACK_VARIABLES_PARMS,
-			  bool *thread_ran);
+			  bool *stopped_start, void *stackaddr,
+			  size_t stacksize, bool *thread_ran);
 
 #include <createthread.c>
 
@@ -621,7 +621,8 @@ int
 __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 		      void *(*start_routine) (void *), void *arg)
 {
-  STACK_VARIABLES;
+  void *stackaddr = NULL;
+  size_t stacksize = 0;
 
   /* Avoid a data race in the multi-threaded case.  */
   if (__libc_single_threaded)
@@ -641,7 +642,7 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
     }
 
   struct pthread *pd = NULL;
-  int err = ALLOCATE_STACK (iattr, &pd);
+  int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
   int retval = 0;
 
   if (__glibc_unlikely (err != 0))
@@ -786,8 +787,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 
       /* We always create the thread stopped at startup so we can
 	 notify the debugger.  */
-      retval = create_thread (pd, iattr, &stopped_start,
-			      STACK_VARIABLES_ARGS, &thread_ran);
+      retval = create_thread (pd, iattr, &stopped_start, stackaddr,
+			      stacksize, &thread_ran);
       if (retval == 0)
 	{
 	  /* We retain ownership of PD until (a) (see CONCURRENCY NOTES
@@ -818,8 +819,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 	}
     }
   else
-    retval = create_thread (pd, iattr, &stopped_start,
-			    STACK_VARIABLES_ARGS, &thread_ran);
+    retval = create_thread (pd, iattr, &stopped_start, stackaddr,
+			    stacksize, &thread_ran);
 
   /* Return to the previous signal mask, after creating the new
      thread.  */
diff --git a/sysdeps/generic/clone-internal.h b/sysdeps/generic/clone-internal.h
new file mode 100644
index 0000000000..1c1f997ca5
--- /dev/null
+++ b/sysdeps/generic/clone-internal.h
@@ -0,0 +1,49 @@
+/* The internal wrapper of clone and clone3.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <stddef.h>
+
+struct clone_args
+{
+  uint64_t flags;	 /* Flags bit mask.  */
+  uint64_t pidfd;	 /* Where to store PID file descriptor
+			    (pid_t *).  */
+  uint64_t child_tid;	 /* Where to store child TID, in child's memory
+			    (pid_t *).  */
+  uint64_t parent_tid;	 /* Where to store child TID, in parent's memory
+			    (int *). */
+  uint64_t exit_signal;	 /* Signal to deliver to parent on child
+			    termination */
+  uint64_t stack;	 /* The lowest address of stack.  */
+  uint64_t stack_size;	 /* Size of stack.  */
+  uint64_t tls;		 /* Location of new TLS.  */
+  uint64_t set_tid;	 /* Pointer to a pid_t array
+			    (since Linux 5.5).  */
+  uint64_t set_tid_size; /* Number of elements in set_tid
+			    (since Linux 5.5). */
+  uint64_t cgroup;	 /* File descriptor for target cgroup
+			    of child (since Linux 5.7).  */
+};
+
+extern int __clone3 (struct clone_args *cl_args, size_t size,
+		     int (*__fn) (void *__arg), void *__arg)
+  __attribute__ ((visibility ("hidden")));
+
+/* The internal wrapper of clone and clone3.  */
+extern __typeof (__clone3) __clone_internal;
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 51e28b97ac..499f2435eb 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -54,6 +54,8 @@ CFLAGS-malloc.c += -DMORECORE_CLEARS=2
 endif
 
 ifeq ($(subdir),misc)
+gen-as-const-headers += clone-offsets.sym
+
 sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   setfsuid setfsgid epoll_pwait signalfd \
 		   eventfd eventfd_read eventfd_write prlimit \
@@ -64,7 +66,7 @@ sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   time64-support pselect32 \
 		   xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
 		   fxstatat fxstatat64 \
-		   xmknod xmknodat
+		   xmknod xmknodat clone3 clone-internal
 
 CFLAGS-gethostid.c = -fexceptions
 CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
index c35f783e2a..9bb13d593f 100644
--- a/sysdeps/unix/sysv/linux/Versions
+++ b/sysdeps/unix/sysv/linux/Versions
@@ -179,5 +179,6 @@ libc {
     __sigtimedwait;
     # functions used by nscd
     __netlink_assert_response;
+    __clone_internal;
   }
 }
diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
new file mode 100644
index 0000000000..7d755cd97f
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-internal.c
@@ -0,0 +1,61 @@
+/* The internal wrapper of clone and clone3.  Linux version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+#include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
+
+int
+__clone_internal (struct clone_args *cl_args, size_t size,
+		  int (*fn)(void *arg), void *arg)
+{
+  /* Try clone3 first.  */
+  int ret = __clone3 (cl_args, size, fn, arg);
+  if (ret == -1 && errno == ENOSYS)
+    {
+      /* NB: Must clear errno since errno may be checked against non-zero
+         return value.  */
+      __set_errno (0);
+      /* Map clone3 arguments to clone arguments.  */
+      int flags = cl_args->flags | cl_args->exit_signal;
+      void *stack = (void *) (uintptr_t) cl_args->stack;
+
+#ifdef __ia64__
+      ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
+		      flags, arg,
+		      (void *) (uintptr_t) cl_args->parent_tid,
+		      (void *) (uintptr_t) cl_args->tls,
+		      (void *) (uintptr_t) cl_args->child_tid);
+#else
+# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
+#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# endif
+
+# if _STACK_GROWS_DOWN
+      stack += cl_args->stack_size;
+# endif
+      ret = __clone (fn, stack, flags, arg,
+		     (void *) (uintptr_t) cl_args->parent_tid,
+		     (void *) (uintptr_t) cl_args->tls,
+		     (void *) (uintptr_t) cl_args->child_tid);
+#endif
+    }
+  return ret;
+}
+
+libc_hidden_def (__clone_internal)
diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
new file mode 100644
index 0000000000..38d447bcf3
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
@@ -0,0 +1,9 @@
+#include <stddef.h>
+#include <sched.h>
+
+--
+
+#define clone_args_offset(member) offsetof (struct clone_args, member)
+
+CLONE_ARGS_STACK_OFFSET		clone_args_offset(stack)
+CLONE_ARGS_STACK_SIZE_OFFSET	clone_args_offset(stack_size)
diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
new file mode 100644
index 0000000000..c7d88e7af7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone3.c
@@ -0,0 +1,31 @@
+/* The clone3 syscall wrapper.  Linux version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+
+int
+__clone3 (struct clone_args *cl_args, size_t size,
+	  int (*fn)(void *arg), void *arg)
+{
+  if (cl_args == NULL || fn == NULL)
+    __set_errno (EINVAL);
+  else
+    __set_errno (ENOSYS);
+  return -1;
+}
diff --git a/sysdeps/unix/sysv/linux/createthread.c b/sysdeps/unix/sysv/linux/createthread.c
index bc3409b326..599c172de8 100644
--- a/sysdeps/unix/sysv/linux/createthread.c
+++ b/sysdeps/unix/sysv/linux/createthread.c
@@ -28,12 +28,6 @@
 
 #include <arch-fork.h>
 
-#ifdef __NR_clone2
-# define ARCH_CLONE __clone2
-#else
-# define ARCH_CLONE __clone
-#endif
-
 /* See the comments in pthread_create.c for the requirements for these
    two macros and the create_thread function.  */
 
@@ -47,7 +41,8 @@ static int start_thread (void *arg) __attribute__ ((noreturn));
 
 static int
 create_thread (struct pthread *pd, const struct pthread_attr *attr,
-	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
+	       bool *stopped_start, void *stackaddr, size_t stacksize,
+	       bool *thread_ran)
 {
   /* Determine whether the newly created threads has to be started
      stopped since we have to set the scheduling parameters or set the
@@ -100,9 +95,18 @@ create_thread (struct pthread *pd, const struct pthread_attr *attr,
 
   TLS_DEFINE_INIT_TP (tp, pd);
 
-  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
-				    clone_flags, pd, &pd->tid, tp, &pd->tid)
-			== -1))
+  struct clone_args args =
+    {
+      .flags = clone_flags,
+      .pidfd = (uintptr_t) &pd->tid,
+      .parent_tid = (uintptr_t) &pd->tid,
+      .child_tid = (uintptr_t) &pd->tid,
+      .stack = (uintptr_t) stackaddr,
+      .stack_size = stacksize,
+      .tls = (uintptr_t) tp,
+    };
+  int ret = __clone_internal (&args, sizeof (args), &start_thread, pd);
+  if (__glibc_unlikely (ret == -1))
     return errno;
 
   /* It's started now, so if we fail below, we'll have to cancel it
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index b53b81b8fc..c63e2b65f1 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -59,21 +59,6 @@
    normal program exit with the exit code 127.  */
 #define SPAWN_ERROR	127
 
-#ifdef __ia64__
-# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
-  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
-#else
-# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
-  __clone (__fn, __stack, __flags, __args)
-#endif
-
-/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
-#if _STACK_GROWS_UP || defined (__ia64__)
-# define STACK(__stack, __stack_size) (__stack)
-#elif _STACK_GROWS_DOWN
-# define STACK(__stack, __stack_size) (__stack + __stack_size)
-#endif
-
 
 struct posix_spawn_args
 {
@@ -379,8 +364,15 @@ __spawnix (pid_t * pid, const char *file,
      need for CLONE_SETTLS.  Although parent and child share the same TLS
      namespace, there will be no concurrent access for TLS variables (errno
      for instance).  */
-  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
-		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
+  struct clone_args clone_args =
+    {
+      .flags = CLONE_VM | CLONE_VFORK,
+      .exit_signal = SIGCHLD,
+      .stack = (uintptr_t) stack,
+      .stack_size = stack_size,
+    };
+  new_pid = __clone_internal (&clone_args, sizeof (clone_args),
+			      __spawni_child, &args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
index 6ace61bac3..5f00b8ee4a 100644
--- a/sysdeps/unix/sysv/linux/tst-align-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
@@ -23,6 +23,7 @@
 #include <sys/wait.h>
 #include <unistd.h>
 #include <tst-stack-align.h>
+#include <clone-internal.h>
 #include <stackinfo.h>
 
 static int
@@ -49,21 +50,17 @@ do_test (void)
     ok = false;
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024];
-  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-# if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), 0, 0);
-# elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, 0, 0);
-# else
-#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-# endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
   if (p == -1)
     {
       printf("clone failed: %m\n");
diff --git a/sysdeps/unix/sysv/linux/tst-clone.c b/sysdeps/unix/sysv/linux/tst-clone.c
index e6ae0106ef..2865182f05 100644
--- a/sysdeps/unix/sysv/linux/tst-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-clone.c
@@ -22,11 +22,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <sched.h>
-
-#ifdef __ia64__
-extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		     size_t __child_stack_size, int __flags, void *__arg, ...);
-#endif
+#include <clone-internal.h>
 
 int child_fn(void *arg)
 {
@@ -39,11 +35,7 @@ do_test (void)
 {
   int result;
 
-#ifdef __ia64__
-  result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
-#else
-  result = clone (child_fn, NULL, 0, NULL);
-#endif
+  result = __clone_internal (NULL, 0, child_fn, NULL);
 
   if (errno != EINVAL || result != -1)
     {
diff --git a/sysdeps/unix/sysv/linux/tst-clone2.c b/sysdeps/unix/sysv/linux/tst-clone2.c
index ce36c70c0d..fda77ac48b 100644
--- a/sysdeps/unix/sysv/linux/tst-clone2.c
+++ b/sysdeps/unix/sysv/linux/tst-clone2.c
@@ -26,9 +26,11 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <errno.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <sys/syscall.h>
+#include <clone-internal.h>
 
 #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
 
@@ -70,23 +72,18 @@ do_test (void)
   if (pipe2 (pipefd, O_CLOEXEC))
     FAIL_EXIT1 ("pipe failed: %m");
 
-  int clone_flags = 0;
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), clone_flags, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-#if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), clone_flags, 0);
-#elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, clone_flags, 0);
-#else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-#endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
 
   close (pipefd[1]);
 
diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
index 1a1bfd4586..3e505807b6 100644
--- a/sysdeps/unix/sysv/linux/tst-clone3.c
+++ b/sysdeps/unix/sysv/linux/tst-clone3.c
@@ -20,10 +20,12 @@
 #include <sched.h>
 #include <signal.h>
 #include <unistd.h>
+#include <errno.h>
 #include <sys/syscall.h>
 #include <sys/wait.h>
 #include <sys/types.h>
 #include <linux/futex.h>
+#include <clone-internal.h>
 
 #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
 #include <support/check.h>
@@ -78,23 +80,15 @@ do_test (void)
   pid_t ctid = CTID_INIT_VAL;
   pid_t tid;
 
-#ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-		  /* tls */ NULL, &ctid);
-#else
-#if _STACK_GROWS_DOWN
-  tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-	       /* tls */ NULL, &ctid);
-#elif _STACK_GROWS_UP
-  tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
-	       &ctid);
-#else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-#endif
-#endif
+  struct clone_args clone_args =
+    {
+      .flags = clone_flags & ~CSIGNAL,
+      .exit_signal = clone_flags & CSIGNAL,
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+      .child_tid = (uintptr_t) &ctid,
+    };
+  tid = __clone_internal (&clone_args, sizeof (clone_args), f, NULL);
   if (tid == -1)
     FAIL_EXIT1 ("clone failed: %m");
 
diff --git a/sysdeps/unix/sysv/linux/tst-getpid1.c b/sysdeps/unix/sysv/linux/tst-getpid1.c
index 253ebf2e15..b78f56df4f 100644
--- a/sysdeps/unix/sysv/linux/tst-getpid1.c
+++ b/sysdeps/unix/sysv/linux/tst-getpid1.c
@@ -5,6 +5,7 @@
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <clone-internal.h>
 #include <stackinfo.h>
 
 #ifndef TEST_CLONE_FLAGS
@@ -42,21 +43,19 @@ do_test (void)
     }
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-# if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
-# elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, TEST_CLONE_FLAGS, 0);
-# else
-#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-# endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .flags = TEST_CLONE_FLAGS & ~CSIGNAL,
+      .exit_signal = TEST_CLONE_FLAGS & CSIGNAL,
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
   if (p == -1)
     {
       printf("clone failed: %m\n");
diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
new file mode 100644
index 0000000000..4d99b4b881
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
@@ -0,0 +1,105 @@
+/* The clone3 syscall wrapper.  Linux/x86-64 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* clone3() is even more special than fork() as it mucks with stacks
+   and invokes a function in the right context after its all over.  */
+
+#include <sysdep.h>
+#include <clone-offsets.h>
+
+/* The userland implementation is:
+   int clone3 (struct clone_args *cl_args, size_t size,
+	       int (*fn)(void *arg), void *arg);
+   the kernel entry is:
+   int clone3 (struct clone_args *cl_args, size_t size);
+
+   The parameters are passed in registers from userland:
+   rdi: cl_args
+   rsi: size
+   rdx: fn
+   rcx: arg
+
+   The kernel expects:
+   rax: system call number
+   rdi: cl_args
+   rsi: size  */
+
+        .text
+ENTRY (__clone3)
+	/* Sanity check arguments.  */
+	movq	$-EINVAL, %rax
+	testq	%rdi, %rdi		/* No NULL cl_args pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+	testq	%rdx, %rdx		/* No NULL function pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+
+	/* Load the new stack size into R9.  */
+	movq	CLONE_ARGS_STACK_SIZE_OFFSET(%rdi), %r9
+
+	/* Load the new stack bottom into R8.  */
+	movq	CLONE_ARGS_STACK_OFFSET(%rdi), %r8
+
+	/* Make room for 2 8-byte slots on the new stack.  */
+	subq	$16, %r9
+
+	/* Insert the argument onto the new stack.  */
+	movq	%rcx, 8(%r8, %r9)
+
+	/* Save the function pointer.  It will be popped off in the
+	   child.  */
+	movq	%rdx, (%r8, %r9)
+
+	/* Update the stack size field passed to clone3.  */
+	movq	%r9, CLONE_ARGS_STACK_SIZE_OFFSET(%rdi)
+
+	/* Do the system call.  */
+	movl	$SYS_ify(clone3), %eax
+
+	/* End FDE now, because in the child the unwind info will be
+	   wrong.  */
+	cfi_endproc;
+	syscall
+
+	test	%RAX_LP, %RAX_LP
+	jl	SYSCALL_ERROR_LABEL
+	jz	L(thread_start)
+
+	ret
+
+L(thread_start):
+	cfi_startproc;
+	/* Clearing frame pointer is insufficient, use CFI.  */
+	cfi_undefined (rip);
+	/* Clear the frame pointer.  The ABI suggests this be done, to mark
+	   the outermost frame obviously.  */
+	xorl	%ebp, %ebp
+
+	/* Set up arguments for the function call.  */
+	popq	%rax		/* Function to call.  */
+	popq	%rdi		/* Argument.  */
+	call	*%rax
+	/* Call exit with return value from function call. */
+	movq	%rax, %rdi
+	movl	$SYS_ify(exit), %eax
+	syscall
+	cfi_endproc;
+
+	cfi_startproc;
+PSEUDO_END (__clone3)
+
+libc_hidden_def (__clone3)
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Add an internal wrapper for clone, clone2 and clone3
  2021-03-04 18:50       ` H.J. Lu via Libc-alpha
@ 2021-03-05  8:32         ` Christian Brauner
  2021-03-05 16:59           ` [PATCH v3] " H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 14+ messages in thread
From: Christian Brauner @ 2021-03-05  8:32 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Florian Weimer, H.J. Lu via Libc-alpha

On Thu, Mar 04, 2021 at 10:50:29AM -0800, H.J. Lu via Libc-alpha wrote:
> On Thu, Mar 4, 2021 at 10:40 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Feb 15, 2021 at 6:56 AM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > >
> > >
> > > On 15/02/2021 07:44, Florian Weimer via Libc-alpha wrote:
> > > > * H. J. Lu via Libc-alpha:
> > > >
> > > >> The clone3 system call provides a superset of the functionality of clone
> > > >> and clone2.  It also provides a number of API improve ments, including
> > > >> the ability to specify the size of the child's stack area which can be
> > > >> used by kernel to compute the shadow stack size when allocating the
> > > >> shadow stack.  Add:
> > > >>
> > > >> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> > > >>                           int (*__fn) (void *__arg), void *__arg);
> > > >>
> > > >> to provide an abstract interface for clone, clone2 and clone3.
> > > >>
> > > >> 1. Simplify stack management for clone by passing stack base and size
> > > >> to __clone_internal.
> > > >> 2. Consolidate clone vs clone2 differences into a single file.
> > > >> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> > > >> with ENOSYS, __clone_internal will fall back to clone or clone2.
> > > >> 4. Add the x86-64 clone3 wrapper.
> > > >> 5. Enable the public clone3 wrapper in the future after it has been
> > > >> added to add targets.
> > > >
> > > > What do you think about providing a clone wrapper which reuses the
> > > > caller's stack?  That would be useful for a (v)fork-style clone.  This
> > > > variant could also be exported because the callback inherits a
> > > > semi-valid TCB in this case.
> > > I have this one as a TODO to improve the posix_spawn.
> >
> > Here is the v2 patch:
> >
> > 1. Update the stack size field, instead of the stack field, passed to
> > clone3.
> >
> > OK for master?
> 
> Really add the patch this time :-).
> 
> -- 
> H.J.

Just a small comment.

> From 2d48b40766bc870d44ee31db2e27843bb2376330 Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Sat, 13 Feb 2021 11:47:46 -0800
> Subject: [PATCH v2] Add an internal wrapper for clone, clone2 and clone3
> 
> The clone3 system call provides a superset of the functionality of clone
> and clone2.  It also provides a number of API improve ments, including
> the ability to specify the size of the child's stack area which can be
> used by kernel to compute the shadow stack size when allocating the
> shadow stack.  Add:
> 
> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> 			     int (*__fn) (void *__arg), void *__arg);
> 
> to provide an abstract interface for clone, clone2 and clone3.
> 
> 1. Simplify stack management for clone by passing stack base and size
> to __clone_internal.
> 2. Consolidate clone vs clone2 differences into a single file.
> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> with ENOSYS, __clone_internal will fall back to clone or clone2.
> 4. Add the x86-64 clone3 wrapper.
> 5. Enable the public clone3 wrapper in the future after it has been
> added to all targets.
> 
> Tested with build-many-glibcs.py.  The x86-64 clone3 wrapper and the
> i686 clone fallback are tested under 5.10.15 kernel.
> 
> Changes in v2:
> 
> 1. Update the stack size field, instead of the stack field, passed to
> clone3.
> ---
>  include/sched.h                           |   3 +
>  nptl/allocatestack.c                      |  59 ++----------
>  nptl/createthread.c                       |   3 +-
>  nptl/pthread_create.c                     |  17 ++--
>  sysdeps/generic/clone-internal.h          |  49 ++++++++++
>  sysdeps/unix/sysv/linux/Makefile          |   4 +-
>  sysdeps/unix/sysv/linux/Versions          |   1 +
>  sysdeps/unix/sysv/linux/clone-internal.c  |  61 +++++++++++++
>  sysdeps/unix/sysv/linux/clone-offsets.sym |   9 ++
>  sysdeps/unix/sysv/linux/clone3.c          |  31 +++++++
>  sysdeps/unix/sysv/linux/createthread.c    |  24 ++---
>  sysdeps/unix/sysv/linux/spawni.c          |  26 ++----
>  sysdeps/unix/sysv/linux/tst-align-clone.c |  23 +++--
>  sysdeps/unix/sysv/linux/tst-clone.c       |  12 +--
>  sysdeps/unix/sysv/linux/tst-clone2.c      |  25 +++---
>  sysdeps/unix/sysv/linux/tst-clone3.c      |  28 +++---
>  sysdeps/unix/sysv/linux/tst-getpid1.c     |  25 +++---
>  sysdeps/unix/sysv/linux/x86_64/clone3.S   | 105 ++++++++++++++++++++++
>  18 files changed, 347 insertions(+), 158 deletions(-)
>  create mode 100644 sysdeps/generic/clone-internal.h
>  create mode 100644 sysdeps/unix/sysv/linux/clone-internal.c
>  create mode 100644 sysdeps/unix/sysv/linux/clone-offsets.sym
>  create mode 100644 sysdeps/unix/sysv/linux/clone3.c
>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/clone3.S
> 
> diff --git a/include/sched.h b/include/sched.h
> index b0bf971c93..28dd99a571 100644
> --- a/include/sched.h
> +++ b/include/sched.h
> @@ -32,5 +32,8 @@ libc_hidden_proto (__clone2)
>     and Hurd doesn't have it.  */
>  extern int __getcpu (unsigned int *, unsigned int *);
>  libc_hidden_proto (__getcpu)
> +
> +# include <clone-internal.h>
> +libc_hidden_proto (__clone_internal)
>  #endif
>  #endif
> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index 149b999603..73c7f33a00 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -34,47 +34,6 @@
>  #include <stack-aliasing.h>
>  
>  
> -#ifndef NEED_SEPARATE_REGISTER_STACK
> -
> -/* Most architectures have exactly one stack pointer.  Some have more.  */
> -# define STACK_VARIABLES void *stackaddr = NULL
> -
> -/* How to pass the values to the 'create_thread' function.  */
> -# define STACK_VARIABLES_ARGS stackaddr
> -
> -/* How to declare function which gets there parameters.  */
> -# define STACK_VARIABLES_PARMS void *stackaddr
> -
> -/* How to declare allocate_stack.  */
> -# define ALLOCATE_STACK_PARMS void **stack
> -
> -/* This is how the function is called.  We do it this way to allow
> -   other variants of the function to have more parameters.  */
> -# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
> -
> -#else
> -
> -/* We need two stacks.  The kernel will place them but we have to tell
> -   the kernel about the size of the reserved address space.  */
> -# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
> -
> -/* How to pass the values to the 'create_thread' function.  */
> -# define STACK_VARIABLES_ARGS stackaddr, stacksize
> -
> -/* How to declare function which gets there parameters.  */
> -# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
> -
> -/* How to declare allocate_stack.  */
> -# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
> -
> -/* This is how the function is called.  We do it this way to allow
> -   other variants of the function to have more parameters.  */
> -# define ALLOCATE_STACK(attr, pd) \
> -  allocate_stack (attr, pd, &stackaddr, &stacksize)
> -
> -#endif
> -
> -
>  /* Default alignment of stack.  */
>  #ifndef STACK_ALIGN
>  # define STACK_ALIGN __alignof__ (long double)
> @@ -399,7 +358,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
>     PDP must be non-NULL.  */
>  static int
>  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
> -		ALLOCATE_STACK_PARMS)
> +		void **stack, size_t *stacksize)
>  {
>    struct pthread *pd;
>    size_t size;
> @@ -760,25 +719,17 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>    /* We place the thread descriptor at the end of the stack.  */
>    *pdp = pd;
>  
> -#if _STACK_GROWS_DOWN
>    void *stacktop;
>  
> -# if TLS_TCB_AT_TP
> +#if TLS_TCB_AT_TP
>    /* The stack begins before the TCB and the static TLS block.  */
>    stacktop = ((char *) (pd + 1) - __static_tls_size);
> -# elif TLS_DTV_AT_TP
> +#elif TLS_DTV_AT_TP
>    stacktop = (char *) (pd - 1);
> -# endif
> +#endif
>  
> -# ifdef NEED_SEPARATE_REGISTER_STACK
> -  *stack = pd->stackblock;
> -  *stacksize = stacktop - *stack;
> -# else
> -  *stack = stacktop;
> -# endif
> -#else
> +  *stacksize = stacktop - pd->stackblock;
>    *stack = pd->stackblock;
> -#endif
>  
>    return 0;
>  }
> diff --git a/nptl/createthread.c b/nptl/createthread.c
> index 46943b33fe..2ac83111ec 100644
> --- a/nptl/createthread.c
> +++ b/nptl/createthread.c
> @@ -25,7 +25,8 @@
>  
>  static int
>  create_thread (struct pthread *pd, const struct pthread_attr *attr,
> -	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
> +	       bool *stopped_start, void *stackaddr, size_t stacksize,
> +	       bool *thread_ran)
>  {
>    /* If the implementation needs to do some tweaks to the thread after
>       it has been created at the OS level, it can set STOPPED_START here.  */
> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 6c645aff48..2cc3d25697 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -201,8 +201,8 @@ unsigned int __nptl_nthreads = 1;
>     be set to true iff the thread actually started up and then got
>     canceled before calling user code (*PD->start_routine).  */
>  static int create_thread (struct pthread *pd, const struct pthread_attr *attr,
> -			  bool *stopped_start, STACK_VARIABLES_PARMS,
> -			  bool *thread_ran);
> +			  bool *stopped_start, void *stackaddr,
> +			  size_t stacksize, bool *thread_ran);
>  
>  #include <createthread.c>
>  
> @@ -621,7 +621,8 @@ int
>  __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
>  		      void *(*start_routine) (void *), void *arg)
>  {
> -  STACK_VARIABLES;
> +  void *stackaddr = NULL;
> +  size_t stacksize = 0;
>  
>    /* Avoid a data race in the multi-threaded case.  */
>    if (__libc_single_threaded)
> @@ -641,7 +642,7 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
>      }
>  
>    struct pthread *pd = NULL;
> -  int err = ALLOCATE_STACK (iattr, &pd);
> +  int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
>    int retval = 0;
>  
>    if (__glibc_unlikely (err != 0))
> @@ -786,8 +787,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
>  
>        /* We always create the thread stopped at startup so we can
>  	 notify the debugger.  */
> -      retval = create_thread (pd, iattr, &stopped_start,
> -			      STACK_VARIABLES_ARGS, &thread_ran);
> +      retval = create_thread (pd, iattr, &stopped_start, stackaddr,
> +			      stacksize, &thread_ran);
>        if (retval == 0)
>  	{
>  	  /* We retain ownership of PD until (a) (see CONCURRENCY NOTES
> @@ -818,8 +819,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
>  	}
>      }
>    else
> -    retval = create_thread (pd, iattr, &stopped_start,
> -			    STACK_VARIABLES_ARGS, &thread_ran);
> +    retval = create_thread (pd, iattr, &stopped_start, stackaddr,
> +			    stacksize, &thread_ran);
>  
>    /* Return to the previous signal mask, after creating the new
>       thread.  */
> diff --git a/sysdeps/generic/clone-internal.h b/sysdeps/generic/clone-internal.h
> new file mode 100644
> index 0000000000..1c1f997ca5
> --- /dev/null
> +++ b/sysdeps/generic/clone-internal.h
> @@ -0,0 +1,49 @@
> +/* The internal wrapper of clone and clone3.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +#include <stddef.h>
> +
> +struct clone_args
> +{
> +  uint64_t flags;	 /* Flags bit mask.  */
> +  uint64_t pidfd;	 /* Where to store PID file descriptor
> +			    (pid_t *).  */
	
I think that's just a typ but just to make sure: this should be an int not a pid_t.

> +  uint64_t child_tid;	 /* Where to store child TID, in child's memory
> +			    (pid_t *).  */
> +  uint64_t parent_tid;	 /* Where to store child TID, in parent's memory
> +			    (int *). */
> +  uint64_t exit_signal;	 /* Signal to deliver to parent on child
> +			    termination */
> +  uint64_t stack;	 /* The lowest address of stack.  */
> +  uint64_t stack_size;	 /* Size of stack.  */
> +  uint64_t tls;		 /* Location of new TLS.  */
> +  uint64_t set_tid;	 /* Pointer to a pid_t array
> +			    (since Linux 5.5).  */
> +  uint64_t set_tid_size; /* Number of elements in set_tid
> +			    (since Linux 5.5). */
> +  uint64_t cgroup;	 /* File descriptor for target cgroup
> +			    of child (since Linux 5.7).  */
> +};
> +
> +extern int __clone3 (struct clone_args *cl_args, size_t size,
> +		     int (*__fn) (void *__arg), void *__arg)
> +  __attribute__ ((visibility ("hidden")));
> +
> +/* The internal wrapper of clone and clone3.  */
> +extern __typeof (__clone3) __clone_internal;
> diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> index 51e28b97ac..499f2435eb 100644
> --- a/sysdeps/unix/sysv/linux/Makefile
> +++ b/sysdeps/unix/sysv/linux/Makefile
> @@ -54,6 +54,8 @@ CFLAGS-malloc.c += -DMORECORE_CLEARS=2
>  endif
>  
>  ifeq ($(subdir),misc)
> +gen-as-const-headers += clone-offsets.sym
> +
>  sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
>  		   setfsuid setfsgid epoll_pwait signalfd \
>  		   eventfd eventfd_read eventfd_write prlimit \
> @@ -64,7 +66,7 @@ sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
>  		   time64-support pselect32 \
>  		   xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
>  		   fxstatat fxstatat64 \
> -		   xmknod xmknodat
> +		   xmknod xmknodat clone3 clone-internal
>  
>  CFLAGS-gethostid.c = -fexceptions
>  CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
> diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
> index c35f783e2a..9bb13d593f 100644
> --- a/sysdeps/unix/sysv/linux/Versions
> +++ b/sysdeps/unix/sysv/linux/Versions
> @@ -179,5 +179,6 @@ libc {
>      __sigtimedwait;
>      # functions used by nscd
>      __netlink_assert_response;
> +    __clone_internal;
>    }
>  }
> diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
> new file mode 100644
> index 0000000000..7d755cd97f
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone-internal.c
> @@ -0,0 +1,61 @@
> +/* The internal wrapper of clone and clone3.  Linux version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <sched.h>
> +#include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
> +
> +int
> +__clone_internal (struct clone_args *cl_args, size_t size,
> +		  int (*fn)(void *arg), void *arg)
> +{
> +  /* Try clone3 first.  */
> +  int ret = __clone3 (cl_args, size, fn, arg);
> +  if (ret == -1 && errno == ENOSYS)
> +    {
> +      /* NB: Must clear errno since errno may be checked against non-zero
> +         return value.  */
> +      __set_errno (0);
> +      /* Map clone3 arguments to clone arguments.  */
> +      int flags = cl_args->flags | cl_args->exit_signal;
> +      void *stack = (void *) (uintptr_t) cl_args->stack;
> +
> +#ifdef __ia64__
> +      ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
> +		      flags, arg,
> +		      (void *) (uintptr_t) cl_args->parent_tid,
> +		      (void *) (uintptr_t) cl_args->tls,
> +		      (void *) (uintptr_t) cl_args->child_tid);

Should glibc maybe just carry a few simple macros to do this instead of
open-coding the casting? (We do in systemd and LXC for example.):

#define PTR_TO_INT(p) ((int)((intptr_t)(p)))
#define INT_TO_PTR(u) ((void *)((intptr_t)(u)))

#define PTR_TO_PID(p) ((pid_t)((intptr_t)(p)))
#define PID_TO_PTR(u) ((void *)((intptr_t)(u)))

#define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p)))

#define PTR_TO_U64(p) ((__u64)((uintptr_t)(p)))

#define UINT_TO_PTR(u) ((void *) ((uintptr_t) (u)))

#define PTR_TO_USHORT(p) ((unsigned short)((uintptr_t)(p)))

.
.
.

> +#else
> +# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
> +#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> +# endif
> +
> +# if _STACK_GROWS_DOWN
> +      stack += cl_args->stack_size;
> +# endif
> +      ret = __clone (fn, stack, flags, arg,
> +		     (void *) (uintptr_t) cl_args->parent_tid,
> +		     (void *) (uintptr_t) cl_args->tls,
> +		     (void *) (uintptr_t) cl_args->child_tid);
> +#endif
> +    }
> +  return ret;
> +}
> +
> +libc_hidden_def (__clone_internal)
> diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
> new file mode 100644
> index 0000000000..38d447bcf3
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
> @@ -0,0 +1,9 @@
> +#include <stddef.h>
> +#include <sched.h>
> +
> +--
> +
> +#define clone_args_offset(member) offsetof (struct clone_args, member)
> +
> +CLONE_ARGS_STACK_OFFSET		clone_args_offset(stack)
> +CLONE_ARGS_STACK_SIZE_OFFSET	clone_args_offset(stack_size)
> diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
> new file mode 100644
> index 0000000000..c7d88e7af7
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone3.c
> @@ -0,0 +1,31 @@
> +/* The clone3 syscall wrapper.  Linux version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <sched.h>
> +
> +int
> +__clone3 (struct clone_args *cl_args, size_t size,
> +	  int (*fn)(void *arg), void *arg)
> +{
> +  if (cl_args == NULL || fn == NULL)
> +    __set_errno (EINVAL);
> +  else
> +    __set_errno (ENOSYS);
> +  return -1;
> +}
> diff --git a/sysdeps/unix/sysv/linux/createthread.c b/sysdeps/unix/sysv/linux/createthread.c
> index bc3409b326..599c172de8 100644
> --- a/sysdeps/unix/sysv/linux/createthread.c
> +++ b/sysdeps/unix/sysv/linux/createthread.c
> @@ -28,12 +28,6 @@
>  
>  #include <arch-fork.h>
>  
> -#ifdef __NR_clone2
> -# define ARCH_CLONE __clone2
> -#else
> -# define ARCH_CLONE __clone
> -#endif
> -
>  /* See the comments in pthread_create.c for the requirements for these
>     two macros and the create_thread function.  */
>  
> @@ -47,7 +41,8 @@ static int start_thread (void *arg) __attribute__ ((noreturn));
>  
>  static int
>  create_thread (struct pthread *pd, const struct pthread_attr *attr,
> -	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
> +	       bool *stopped_start, void *stackaddr, size_t stacksize,
> +	       bool *thread_ran)
>  {
>    /* Determine whether the newly created threads has to be started
>       stopped since we have to set the scheduling parameters or set the
> @@ -100,9 +95,18 @@ create_thread (struct pthread *pd, const struct pthread_attr *attr,
>  
>    TLS_DEFINE_INIT_TP (tp, pd);
>  
> -  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
> -				    clone_flags, pd, &pd->tid, tp, &pd->tid)
> -			== -1))
> +  struct clone_args args =
> +    {
> +      .flags = clone_flags,
> +      .pidfd = (uintptr_t) &pd->tid,
> +      .parent_tid = (uintptr_t) &pd->tid,
> +      .child_tid = (uintptr_t) &pd->tid,
> +      .stack = (uintptr_t) stackaddr,
> +      .stack_size = stacksize,
> +      .tls = (uintptr_t) tp,
> +    };
> +  int ret = __clone_internal (&args, sizeof (args), &start_thread, pd);
> +  if (__glibc_unlikely (ret == -1))
>      return errno;
>  
>    /* It's started now, so if we fail below, we'll have to cancel it
> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
> index b53b81b8fc..c63e2b65f1 100644
> --- a/sysdeps/unix/sysv/linux/spawni.c
> +++ b/sysdeps/unix/sysv/linux/spawni.c
> @@ -59,21 +59,6 @@
>     normal program exit with the exit code 127.  */
>  #define SPAWN_ERROR	127
>  
> -#ifdef __ia64__
> -# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
> -  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
> -#else
> -# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
> -  __clone (__fn, __stack, __flags, __args)
> -#endif
> -
> -/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
> -#if _STACK_GROWS_UP || defined (__ia64__)
> -# define STACK(__stack, __stack_size) (__stack)
> -#elif _STACK_GROWS_DOWN
> -# define STACK(__stack, __stack_size) (__stack + __stack_size)
> -#endif
> -
>  
>  struct posix_spawn_args
>  {
> @@ -379,8 +364,15 @@ __spawnix (pid_t * pid, const char *file,
>       need for CLONE_SETTLS.  Although parent and child share the same TLS
>       namespace, there will be no concurrent access for TLS variables (errno
>       for instance).  */
> -  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
> -		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
> +  struct clone_args clone_args =
> +    {
> +      .flags = CLONE_VM | CLONE_VFORK,
> +      .exit_signal = SIGCHLD,
> +      .stack = (uintptr_t) stack,
> +      .stack_size = stack_size,
> +    };
> +  new_pid = __clone_internal (&clone_args, sizeof (clone_args),
> +			      __spawni_child, &args);
>  
>    /* It needs to collect the case where the auxiliary process was created
>       but failed to execute the file (due either any preparation step or
> diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
> index 6ace61bac3..5f00b8ee4a 100644
> --- a/sysdeps/unix/sysv/linux/tst-align-clone.c
> +++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
> @@ -23,6 +23,7 @@
>  #include <sys/wait.h>
>  #include <unistd.h>
>  #include <tst-stack-align.h>
> +#include <clone-internal.h>
>  #include <stackinfo.h>
>  
>  static int
> @@ -49,21 +50,17 @@ do_test (void)
>      ok = false;
>  
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  char st[256 * 1024];
> -  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
> +# define STACK_SIZE 256 * 1024
>  #else
> -  char st[128 * 1024] __attribute__ ((aligned));
> -# if _STACK_GROWS_DOWN
> -  pid_t p = clone (f, st + sizeof (st), 0, 0);
> -# elif _STACK_GROWS_UP
> -  pid_t p = clone (f, st, 0, 0);
> -# else
> -#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> -# endif
> +# define STACK_SIZE 128 * 1024
>  #endif
> +  char st[STACK_SIZE] __attribute__ ((aligned));
> +  struct clone_args clone_args =
> +    {
> +      .stack = (uintptr_t) st,
> +      .stack_size = sizeof (st),
> +    };
> +  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
>    if (p == -1)
>      {
>        printf("clone failed: %m\n");
> diff --git a/sysdeps/unix/sysv/linux/tst-clone.c b/sysdeps/unix/sysv/linux/tst-clone.c
> index e6ae0106ef..2865182f05 100644
> --- a/sysdeps/unix/sysv/linux/tst-clone.c
> +++ b/sysdeps/unix/sysv/linux/tst-clone.c
> @@ -22,11 +22,7 @@
>  #include <stdlib.h>
>  #include <unistd.h>
>  #include <sched.h>
> -
> -#ifdef __ia64__
> -extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		     size_t __child_stack_size, int __flags, void *__arg, ...);
> -#endif
> +#include <clone-internal.h>
>  
>  int child_fn(void *arg)
>  {
> @@ -39,11 +35,7 @@ do_test (void)
>  {
>    int result;
>  
> -#ifdef __ia64__
> -  result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
> -#else
> -  result = clone (child_fn, NULL, 0, NULL);
> -#endif
> +  result = __clone_internal (NULL, 0, child_fn, NULL);
>  
>    if (errno != EINVAL || result != -1)
>      {
> diff --git a/sysdeps/unix/sysv/linux/tst-clone2.c b/sysdeps/unix/sysv/linux/tst-clone2.c
> index ce36c70c0d..fda77ac48b 100644
> --- a/sysdeps/unix/sysv/linux/tst-clone2.c
> +++ b/sysdeps/unix/sysv/linux/tst-clone2.c
> @@ -26,9 +26,11 @@
>  #include <stdbool.h>
>  #include <stdint.h>
>  #include <stdlib.h>
> +#include <errno.h>
>  #include <sys/types.h>
>  #include <sys/wait.h>
>  #include <sys/syscall.h>
> +#include <clone-internal.h>
>  
>  #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
>  
> @@ -70,23 +72,18 @@ do_test (void)
>    if (pipe2 (pipefd, O_CLOEXEC))
>      FAIL_EXIT1 ("pipe failed: %m");
>  
> -  int clone_flags = 0;
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  char st[256 * 1024] __attribute__ ((aligned));
> -  pid_t p = __clone2 (f, st, sizeof (st), clone_flags, 0);
> +# define STACK_SIZE 256 * 1024
>  #else
> -  char st[128 * 1024] __attribute__ ((aligned));
> -#if _STACK_GROWS_DOWN
> -  pid_t p = clone (f, st + sizeof (st), clone_flags, 0);
> -#elif _STACK_GROWS_UP
> -  pid_t p = clone (f, st, clone_flags, 0);
> -#else
> -#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> -#endif
> +# define STACK_SIZE 128 * 1024
>  #endif
> +  char st[STACK_SIZE] __attribute__ ((aligned));
> +  struct clone_args clone_args =
> +    {
> +      .stack = (uintptr_t) st,
> +      .stack_size = sizeof (st),
> +    };
> +  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
>  
>    close (pipefd[1]);
>  
> diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
> index 1a1bfd4586..3e505807b6 100644
> --- a/sysdeps/unix/sysv/linux/tst-clone3.c
> +++ b/sysdeps/unix/sysv/linux/tst-clone3.c
> @@ -20,10 +20,12 @@
>  #include <sched.h>
>  #include <signal.h>
>  #include <unistd.h>
> +#include <errno.h>
>  #include <sys/syscall.h>
>  #include <sys/wait.h>
>  #include <sys/types.h>
>  #include <linux/futex.h>
> +#include <clone-internal.h>
>  
>  #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
>  #include <support/check.h>
> @@ -78,23 +80,15 @@ do_test (void)
>    pid_t ctid = CTID_INIT_VAL;
>    pid_t tid;
>  
> -#ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> -		  /* tls */ NULL, &ctid);
> -#else
> -#if _STACK_GROWS_DOWN
> -  tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> -	       /* tls */ NULL, &ctid);
> -#elif _STACK_GROWS_UP
> -  tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
> -	       &ctid);
> -#else
> -#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> -#endif
> -#endif
> +  struct clone_args clone_args =
> +    {
> +      .flags = clone_flags & ~CSIGNAL,
> +      .exit_signal = clone_flags & CSIGNAL,
> +      .stack = (uintptr_t) st,
> +      .stack_size = sizeof (st),
> +      .child_tid = (uintptr_t) &ctid,
> +    };
> +  tid = __clone_internal (&clone_args, sizeof (clone_args), f, NULL);
>    if (tid == -1)
>      FAIL_EXIT1 ("clone failed: %m");
>  
> diff --git a/sysdeps/unix/sysv/linux/tst-getpid1.c b/sysdeps/unix/sysv/linux/tst-getpid1.c
> index 253ebf2e15..b78f56df4f 100644
> --- a/sysdeps/unix/sysv/linux/tst-getpid1.c
> +++ b/sysdeps/unix/sysv/linux/tst-getpid1.c
> @@ -5,6 +5,7 @@
>  #include <unistd.h>
>  #include <sys/types.h>
>  #include <sys/wait.h>
> +#include <clone-internal.h>
>  #include <stackinfo.h>
>  
>  #ifndef TEST_CLONE_FLAGS
> @@ -42,21 +43,19 @@ do_test (void)
>      }
>  
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  char st[256 * 1024] __attribute__ ((aligned));
> -  pid_t p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
> +# define STACK_SIZE 256 * 1024
>  #else
> -  char st[128 * 1024] __attribute__ ((aligned));
> -# if _STACK_GROWS_DOWN
> -  pid_t p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
> -# elif _STACK_GROWS_UP
> -  pid_t p = clone (f, st, TEST_CLONE_FLAGS, 0);
> -# else
> -#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> -# endif
> +# define STACK_SIZE 128 * 1024
>  #endif
> +  char st[STACK_SIZE] __attribute__ ((aligned));
> +  struct clone_args clone_args =
> +    {
> +      .flags = TEST_CLONE_FLAGS & ~CSIGNAL,
> +      .exit_signal = TEST_CLONE_FLAGS & CSIGNAL,
> +      .stack = (uintptr_t) st,
> +      .stack_size = sizeof (st),
> +    };
> +  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
>    if (p == -1)
>      {
>        printf("clone failed: %m\n");
> diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> new file mode 100644
> index 0000000000..4d99b4b881
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> @@ -0,0 +1,105 @@
> +/* The clone3 syscall wrapper.  Linux/x86-64 version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* clone3() is even more special than fork() as it mucks with stacks
> +   and invokes a function in the right context after its all over.  */
> +
> +#include <sysdep.h>
> +#include <clone-offsets.h>
> +
> +/* The userland implementation is:
> +   int clone3 (struct clone_args *cl_args, size_t size,
> +	       int (*fn)(void *arg), void *arg);
> +   the kernel entry is:
> +   int clone3 (struct clone_args *cl_args, size_t size);
> +
> +   The parameters are passed in registers from userland:
> +   rdi: cl_args
> +   rsi: size
> +   rdx: fn
> +   rcx: arg
> +
> +   The kernel expects:
> +   rax: system call number
> +   rdi: cl_args
> +   rsi: size  */
> +
> +        .text
> +ENTRY (__clone3)
> +	/* Sanity check arguments.  */
> +	movq	$-EINVAL, %rax
> +	testq	%rdi, %rdi		/* No NULL cl_args pointer.  */
> +	jz	SYSCALL_ERROR_LABEL
> +	testq	%rdx, %rdx		/* No NULL function pointer.  */
> +	jz	SYSCALL_ERROR_LABEL
> +
> +	/* Load the new stack size into R9.  */
> +	movq	CLONE_ARGS_STACK_SIZE_OFFSET(%rdi), %r9
> +
> +	/* Load the new stack bottom into R8.  */
> +	movq	CLONE_ARGS_STACK_OFFSET(%rdi), %r8
> +
> +	/* Make room for 2 8-byte slots on the new stack.  */
> +	subq	$16, %r9
> +
> +	/* Insert the argument onto the new stack.  */
> +	movq	%rcx, 8(%r8, %r9)
> +
> +	/* Save the function pointer.  It will be popped off in the
> +	   child.  */
> +	movq	%rdx, (%r8, %r9)
> +
> +	/* Update the stack size field passed to clone3.  */
> +	movq	%r9, CLONE_ARGS_STACK_SIZE_OFFSET(%rdi)
> +
> +	/* Do the system call.  */
> +	movl	$SYS_ify(clone3), %eax
> +
> +	/* End FDE now, because in the child the unwind info will be
> +	   wrong.  */
> +	cfi_endproc;
> +	syscall
> +
> +	test	%RAX_LP, %RAX_LP
> +	jl	SYSCALL_ERROR_LABEL
> +	jz	L(thread_start)
> +
> +	ret
> +
> +L(thread_start):
> +	cfi_startproc;
> +	/* Clearing frame pointer is insufficient, use CFI.  */
> +	cfi_undefined (rip);
> +	/* Clear the frame pointer.  The ABI suggests this be done, to mark
> +	   the outermost frame obviously.  */
> +	xorl	%ebp, %ebp
> +
> +	/* Set up arguments for the function call.  */
> +	popq	%rax		/* Function to call.  */
> +	popq	%rdi		/* Argument.  */
> +	call	*%rax
> +	/* Call exit with return value from function call. */
> +	movq	%rax, %rdi
> +	movl	$SYS_ify(exit), %eax
> +	syscall
> +	cfi_endproc;
> +
> +	cfi_startproc;
> +PSEUDO_END (__clone3)
> +
> +libc_hidden_def (__clone3)
> -- 
> 2.29.2
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3] Add an internal wrapper for clone, clone2 and clone3
  2021-03-05  8:32         ` Christian Brauner
@ 2021-03-05 16:59           ` H.J. Lu via Libc-alpha
  2021-03-18 23:18             ` PING^1 " H.J. Lu via Libc-alpha
                               ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-03-05 16:59 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Florian Weimer, H.J. Lu via Libc-alpha

[-- Attachment #1: Type: text/plain, Size: 3457 bytes --]

On Fri, Mar 5, 2021 at 12:39 AM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
>
> On Thu, Mar 04, 2021 at 10:50:29AM -0800, H.J. Lu via Libc-alpha wrote:
> > On Thu, Mar 4, 2021 at 10:40 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Feb 15, 2021 at 6:56 AM Adhemerval Zanella via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > >
> > > >
> > > > On 15/02/2021 07:44, Florian Weimer via Libc-alpha wrote:
> > > > > * H. J. Lu via Libc-alpha:
> > > > >
> > > > >> The clone3 system call provides a superset of the functionality of clone
> > > > >> and clone2.  It also provides a number of API improve ments, including
> > > > >> the ability to specify the size of the child's stack area which can be
> > > > >> used by kernel to compute the shadow stack size when allocating the
> > > > >> shadow stack.  Add:
> > > > >>
> > > > >> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> > > > >>                           int (*__fn) (void *__arg), void *__arg);
> > > > >>
> > > > >> to provide an abstract interface for clone, clone2 and clone3.
> > > > >>
> > > > >> 1. Simplify stack management for clone by passing stack base and size
> > > > >> to __clone_internal.
> > > > >> 2. Consolidate clone vs clone2 differences into a single file.
> > > > >> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> > > > >> with ENOSYS, __clone_internal will fall back to clone or clone2.
> > > > >> 4. Add the x86-64 clone3 wrapper.
> > > > >> 5. Enable the public clone3 wrapper in the future after it has been
> > > > >> added to add targets.
> > > > >
> > > > > What do you think about providing a clone wrapper which reuses the
> > > > > caller's stack?  That would be useful for a (v)fork-style clone.  This
> > > > > variant could also be exported because the callback inherits a
> > > > > semi-valid TCB in this case.
> > > > I have this one as a TODO to improve the posix_spawn.
> > >
> > > Here is the v2 patch:
> > >
> > > 1. Update the stack size field, instead of the stack field, passed to
> > > clone3.
> > >
> > > OK for master?
> >
> > Really add the patch this time :-).
> >
> > --
> > H.J.
>
> Just a small comment.
>

> > +      __set_errno (0);
> > +      /* Map clone3 arguments to clone arguments.  */
> > +      int flags = cl_args->flags | cl_args->exit_signal;
> > +      void *stack = (void *) (uintptr_t) cl_args->stack;
> > +
> > +#ifdef __ia64__
> > +      ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
> > +                   flags, arg,
> > +                   (void *) (uintptr_t) cl_args->parent_tid,
> > +                   (void *) (uintptr_t) cl_args->tls,
> > +                   (void *) (uintptr_t) cl_args->child_tid);
>
> Should glibc maybe just carry a few simple macros to do this instead of
> open-coding the casting? (We do in systemd and LXC for example.):
>
> #define PTR_TO_INT(p) ((int)((intptr_t)(p)))
> #define INT_TO_PTR(u) ((void *)((intptr_t)(u)))
>
> #define PTR_TO_PID(p) ((pid_t)((intptr_t)(p)))
> #define PID_TO_PTR(u) ((void *)((intptr_t)(u)))
>
> #define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p)))
>
> #define PTR_TO_U64(p) ((__u64)((uintptr_t)(p)))
>
> #define UINT_TO_PTR(u) ((void *) ((uintptr_t) (u)))
>
> #define PTR_TO_USHORT(p) ((unsigned short)((uintptr_t)(p)))
>

Fixed.

Here is the v3 patch.   OK for master?

Changes from v2:

1. Add cast_to_pointer to cast an integer to void * pointer.

-- 
H.J.

[-- Attachment #2: v3-0001-Add-an-internal-wrapper-for-clone-clone2-and-clon.patch --]
[-- Type: text/x-patch, Size: 30227 bytes --]

From 6abddd1f3a351de0841faa4d6f56e81dc7ff5438 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 13 Feb 2021 11:47:46 -0800
Subject: [PATCH v3] Add an internal wrapper for clone, clone2 and clone3

The clone3 system call provides a superset of the functionality of clone
and clone2.  It also provides a number of API improve ments, including
the ability to specify the size of the child's stack area which can be
used by kernel to compute the shadow stack size when allocating the
shadow stack.  Add:

extern int __clone_internal (struct clone_args *cl_args, size_t size,
			     int (*__fn) (void *__arg), void *__arg);

to provide an abstract interface for clone, clone2 and clone3.

1. Add cast_to_pointer to cast an integer to void * pointer.
2. Simplify stack management for clone by passing stack base and size
to __clone_internal.
3. Consolidate clone vs clone2 differences into a single file.
4. Use only __clone_internal to clone a thread.  If clone3 returns -1
with ENOSYS, __clone_internal will fall back to clone or clone2.
5. Add the x86-64 clone3 wrapper.
6. Enable the public clone3 wrapper in the future after it has been
added to all targets.

Tested with build-many-glibcs.py.  The x86-64 clone3 wrapper and the
i686 clone fallback are tested under 5.10.15 kernel.
---
 include/libc-pointer-arith.h              |   3 +
 include/sched.h                           |   3 +
 nptl/allocatestack.c                      |  59 ++----------
 nptl/createthread.c                       |   3 +-
 nptl/pthread_create.c                     |  17 ++--
 sysdeps/generic/clone-internal.h          |  49 ++++++++++
 sysdeps/unix/sysv/linux/Makefile          |   4 +-
 sysdeps/unix/sysv/linux/Versions          |   1 +
 sysdeps/unix/sysv/linux/clone-internal.c  |  62 +++++++++++++
 sysdeps/unix/sysv/linux/clone-offsets.sym |   9 ++
 sysdeps/unix/sysv/linux/clone3.c          |  31 +++++++
 sysdeps/unix/sysv/linux/createthread.c    |  24 ++---
 sysdeps/unix/sysv/linux/spawni.c          |  26 ++----
 sysdeps/unix/sysv/linux/tst-align-clone.c |  23 +++--
 sysdeps/unix/sysv/linux/tst-clone.c       |  12 +--
 sysdeps/unix/sysv/linux/tst-clone2.c      |  25 +++---
 sysdeps/unix/sysv/linux/tst-clone3.c      |  28 +++---
 sysdeps/unix/sysv/linux/tst-getpid1.c     |  25 +++---
 sysdeps/unix/sysv/linux/x86_64/clone3.S   | 105 ++++++++++++++++++++++
 19 files changed, 351 insertions(+), 158 deletions(-)
 create mode 100644 sysdeps/generic/clone-internal.h
 create mode 100644 sysdeps/unix/sysv/linux/clone-internal.c
 create mode 100644 sysdeps/unix/sysv/linux/clone-offsets.sym
 create mode 100644 sysdeps/unix/sysv/linux/clone3.c
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/clone3.S

diff --git a/include/libc-pointer-arith.h b/include/libc-pointer-arith.h
index 72e722c5aa..04ba537617 100644
--- a/include/libc-pointer-arith.h
+++ b/include/libc-pointer-arith.h
@@ -37,6 +37,9 @@
 /* Cast an integer or a pointer VAL to integer with proper type.  */
 # define cast_to_integer(val) ((__integer_if_pointer_type (val)) (val))
 
+/* Cast an integer VAL to void * pointer.  */
+# define cast_to_pointer(val) ((void *) (uintptr_t) (val))
+
 /* Align a value by rounding down to closest size.
    e.g. Using size of 4096, we get this behavior:
 	{4095, 4096, 4097} = {0, 4096, 4096}.  */
diff --git a/include/sched.h b/include/sched.h
index b0bf971c93..28dd99a571 100644
--- a/include/sched.h
+++ b/include/sched.h
@@ -32,5 +32,8 @@ libc_hidden_proto (__clone2)
    and Hurd doesn't have it.  */
 extern int __getcpu (unsigned int *, unsigned int *);
 libc_hidden_proto (__getcpu)
+
+# include <clone-internal.h>
+libc_hidden_proto (__clone_internal)
 #endif
 #endif
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 149b999603..73c7f33a00 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -34,47 +34,6 @@
 #include <stack-aliasing.h>
 
 
-#ifndef NEED_SEPARATE_REGISTER_STACK
-
-/* Most architectures have exactly one stack pointer.  Some have more.  */
-# define STACK_VARIABLES void *stackaddr = NULL
-
-/* How to pass the values to the 'create_thread' function.  */
-# define STACK_VARIABLES_ARGS stackaddr
-
-/* How to declare function which gets there parameters.  */
-# define STACK_VARIABLES_PARMS void *stackaddr
-
-/* How to declare allocate_stack.  */
-# define ALLOCATE_STACK_PARMS void **stack
-
-/* This is how the function is called.  We do it this way to allow
-   other variants of the function to have more parameters.  */
-# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
-
-#else
-
-/* We need two stacks.  The kernel will place them but we have to tell
-   the kernel about the size of the reserved address space.  */
-# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
-
-/* How to pass the values to the 'create_thread' function.  */
-# define STACK_VARIABLES_ARGS stackaddr, stacksize
-
-/* How to declare function which gets there parameters.  */
-# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
-
-/* How to declare allocate_stack.  */
-# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
-
-/* This is how the function is called.  We do it this way to allow
-   other variants of the function to have more parameters.  */
-# define ALLOCATE_STACK(attr, pd) \
-  allocate_stack (attr, pd, &stackaddr, &stacksize)
-
-#endif
-
-
 /* Default alignment of stack.  */
 #ifndef STACK_ALIGN
 # define STACK_ALIGN __alignof__ (long double)
@@ -399,7 +358,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
    PDP must be non-NULL.  */
 static int
 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
-		ALLOCATE_STACK_PARMS)
+		void **stack, size_t *stacksize)
 {
   struct pthread *pd;
   size_t size;
@@ -760,25 +719,17 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
   /* We place the thread descriptor at the end of the stack.  */
   *pdp = pd;
 
-#if _STACK_GROWS_DOWN
   void *stacktop;
 
-# if TLS_TCB_AT_TP
+#if TLS_TCB_AT_TP
   /* The stack begins before the TCB and the static TLS block.  */
   stacktop = ((char *) (pd + 1) - __static_tls_size);
-# elif TLS_DTV_AT_TP
+#elif TLS_DTV_AT_TP
   stacktop = (char *) (pd - 1);
-# endif
+#endif
 
-# ifdef NEED_SEPARATE_REGISTER_STACK
-  *stack = pd->stackblock;
-  *stacksize = stacktop - *stack;
-# else
-  *stack = stacktop;
-# endif
-#else
+  *stacksize = stacktop - pd->stackblock;
   *stack = pd->stackblock;
-#endif
 
   return 0;
 }
diff --git a/nptl/createthread.c b/nptl/createthread.c
index 46943b33fe..2ac83111ec 100644
--- a/nptl/createthread.c
+++ b/nptl/createthread.c
@@ -25,7 +25,8 @@
 
 static int
 create_thread (struct pthread *pd, const struct pthread_attr *attr,
-	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
+	       bool *stopped_start, void *stackaddr, size_t stacksize,
+	       bool *thread_ran)
 {
   /* If the implementation needs to do some tweaks to the thread after
      it has been created at the OS level, it can set STOPPED_START here.  */
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 6c645aff48..2cc3d25697 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -201,8 +201,8 @@ unsigned int __nptl_nthreads = 1;
    be set to true iff the thread actually started up and then got
    canceled before calling user code (*PD->start_routine).  */
 static int create_thread (struct pthread *pd, const struct pthread_attr *attr,
-			  bool *stopped_start, STACK_VARIABLES_PARMS,
-			  bool *thread_ran);
+			  bool *stopped_start, void *stackaddr,
+			  size_t stacksize, bool *thread_ran);
 
 #include <createthread.c>
 
@@ -621,7 +621,8 @@ int
 __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 		      void *(*start_routine) (void *), void *arg)
 {
-  STACK_VARIABLES;
+  void *stackaddr = NULL;
+  size_t stacksize = 0;
 
   /* Avoid a data race in the multi-threaded case.  */
   if (__libc_single_threaded)
@@ -641,7 +642,7 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
     }
 
   struct pthread *pd = NULL;
-  int err = ALLOCATE_STACK (iattr, &pd);
+  int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
   int retval = 0;
 
   if (__glibc_unlikely (err != 0))
@@ -786,8 +787,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 
       /* We always create the thread stopped at startup so we can
 	 notify the debugger.  */
-      retval = create_thread (pd, iattr, &stopped_start,
-			      STACK_VARIABLES_ARGS, &thread_ran);
+      retval = create_thread (pd, iattr, &stopped_start, stackaddr,
+			      stacksize, &thread_ran);
       if (retval == 0)
 	{
 	  /* We retain ownership of PD until (a) (see CONCURRENCY NOTES
@@ -818,8 +819,8 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 	}
     }
   else
-    retval = create_thread (pd, iattr, &stopped_start,
-			    STACK_VARIABLES_ARGS, &thread_ran);
+    retval = create_thread (pd, iattr, &stopped_start, stackaddr,
+			    stacksize, &thread_ran);
 
   /* Return to the previous signal mask, after creating the new
      thread.  */
diff --git a/sysdeps/generic/clone-internal.h b/sysdeps/generic/clone-internal.h
new file mode 100644
index 0000000000..1c1f997ca5
--- /dev/null
+++ b/sysdeps/generic/clone-internal.h
@@ -0,0 +1,49 @@
+/* The internal wrapper of clone and clone3.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <stddef.h>
+
+struct clone_args
+{
+  uint64_t flags;	 /* Flags bit mask.  */
+  uint64_t pidfd;	 /* Where to store PID file descriptor
+			    (pid_t *).  */
+  uint64_t child_tid;	 /* Where to store child TID, in child's memory
+			    (pid_t *).  */
+  uint64_t parent_tid;	 /* Where to store child TID, in parent's memory
+			    (int *). */
+  uint64_t exit_signal;	 /* Signal to deliver to parent on child
+			    termination */
+  uint64_t stack;	 /* The lowest address of stack.  */
+  uint64_t stack_size;	 /* Size of stack.  */
+  uint64_t tls;		 /* Location of new TLS.  */
+  uint64_t set_tid;	 /* Pointer to a pid_t array
+			    (since Linux 5.5).  */
+  uint64_t set_tid_size; /* Number of elements in set_tid
+			    (since Linux 5.5). */
+  uint64_t cgroup;	 /* File descriptor for target cgroup
+			    of child (since Linux 5.7).  */
+};
+
+extern int __clone3 (struct clone_args *cl_args, size_t size,
+		     int (*__fn) (void *__arg), void *__arg)
+  __attribute__ ((visibility ("hidden")));
+
+/* The internal wrapper of clone and clone3.  */
+extern __typeof (__clone3) __clone_internal;
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 51e28b97ac..499f2435eb 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -54,6 +54,8 @@ CFLAGS-malloc.c += -DMORECORE_CLEARS=2
 endif
 
 ifeq ($(subdir),misc)
+gen-as-const-headers += clone-offsets.sym
+
 sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   setfsuid setfsgid epoll_pwait signalfd \
 		   eventfd eventfd_read eventfd_write prlimit \
@@ -64,7 +66,7 @@ sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   time64-support pselect32 \
 		   xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
 		   fxstatat fxstatat64 \
-		   xmknod xmknodat
+		   xmknod xmknodat clone3 clone-internal
 
 CFLAGS-gethostid.c = -fexceptions
 CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
index c35f783e2a..9bb13d593f 100644
--- a/sysdeps/unix/sysv/linux/Versions
+++ b/sysdeps/unix/sysv/linux/Versions
@@ -179,5 +179,6 @@ libc {
     __sigtimedwait;
     # functions used by nscd
     __netlink_assert_response;
+    __clone_internal;
   }
 }
diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
new file mode 100644
index 0000000000..fd9929b6df
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-internal.c
@@ -0,0 +1,62 @@
+/* The internal wrapper of clone and clone3.  Linux version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+#include <libc-pointer-arith.h>	/* For cast_to_pointer.  */
+#include <stackinfo.h>		/* For _STACK_GROWS_{UP,DOWN}.  */
+
+int
+__clone_internal (struct clone_args *cl_args, size_t size,
+		  int (*fn)(void *arg), void *arg)
+{
+  /* Try clone3 first.  */
+  int ret = __clone3 (cl_args, size, fn, arg);
+  if (ret == -1 && errno == ENOSYS)
+    {
+      /* NB: Must clear errno since errno may be checked against non-zero
+         return value.  */
+      __set_errno (0);
+      /* Map clone3 arguments to clone arguments.  */
+      int flags = cl_args->flags | cl_args->exit_signal;
+      void *stack = cast_to_pointer (cl_args->stack);
+
+#ifdef __ia64__
+      ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
+		      flags, arg,
+		      cast_to_pointer (cl_args->parent_tid),
+		      cast_to_pointer (cl_args->tls),
+		      cast_to_pointer (cl_args->child_tid));
+#else
+# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
+#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# endif
+
+# if _STACK_GROWS_DOWN
+      stack += cl_args->stack_size;
+# endif
+      ret = __clone (fn, stack, flags, arg,
+		     (void *) (uintptr_t) cl_args->parent_tid,
+		     (void *) (uintptr_t) cl_args->tls,
+		     (void *) (uintptr_t) cl_args->child_tid);
+#endif
+    }
+  return ret;
+}
+
+libc_hidden_def (__clone_internal)
diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
new file mode 100644
index 0000000000..38d447bcf3
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
@@ -0,0 +1,9 @@
+#include <stddef.h>
+#include <sched.h>
+
+--
+
+#define clone_args_offset(member) offsetof (struct clone_args, member)
+
+CLONE_ARGS_STACK_OFFSET		clone_args_offset(stack)
+CLONE_ARGS_STACK_SIZE_OFFSET	clone_args_offset(stack_size)
diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
new file mode 100644
index 0000000000..c7d88e7af7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone3.c
@@ -0,0 +1,31 @@
+/* The clone3 syscall wrapper.  Linux version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+
+int
+__clone3 (struct clone_args *cl_args, size_t size,
+	  int (*fn)(void *arg), void *arg)
+{
+  if (cl_args == NULL || fn == NULL)
+    __set_errno (EINVAL);
+  else
+    __set_errno (ENOSYS);
+  return -1;
+}
diff --git a/sysdeps/unix/sysv/linux/createthread.c b/sysdeps/unix/sysv/linux/createthread.c
index bc3409b326..599c172de8 100644
--- a/sysdeps/unix/sysv/linux/createthread.c
+++ b/sysdeps/unix/sysv/linux/createthread.c
@@ -28,12 +28,6 @@
 
 #include <arch-fork.h>
 
-#ifdef __NR_clone2
-# define ARCH_CLONE __clone2
-#else
-# define ARCH_CLONE __clone
-#endif
-
 /* See the comments in pthread_create.c for the requirements for these
    two macros and the create_thread function.  */
 
@@ -47,7 +41,8 @@ static int start_thread (void *arg) __attribute__ ((noreturn));
 
 static int
 create_thread (struct pthread *pd, const struct pthread_attr *attr,
-	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
+	       bool *stopped_start, void *stackaddr, size_t stacksize,
+	       bool *thread_ran)
 {
   /* Determine whether the newly created threads has to be started
      stopped since we have to set the scheduling parameters or set the
@@ -100,9 +95,18 @@ create_thread (struct pthread *pd, const struct pthread_attr *attr,
 
   TLS_DEFINE_INIT_TP (tp, pd);
 
-  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
-				    clone_flags, pd, &pd->tid, tp, &pd->tid)
-			== -1))
+  struct clone_args args =
+    {
+      .flags = clone_flags,
+      .pidfd = (uintptr_t) &pd->tid,
+      .parent_tid = (uintptr_t) &pd->tid,
+      .child_tid = (uintptr_t) &pd->tid,
+      .stack = (uintptr_t) stackaddr,
+      .stack_size = stacksize,
+      .tls = (uintptr_t) tp,
+    };
+  int ret = __clone_internal (&args, sizeof (args), &start_thread, pd);
+  if (__glibc_unlikely (ret == -1))
     return errno;
 
   /* It's started now, so if we fail below, we'll have to cancel it
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index b53b81b8fc..c63e2b65f1 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -59,21 +59,6 @@
    normal program exit with the exit code 127.  */
 #define SPAWN_ERROR	127
 
-#ifdef __ia64__
-# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
-  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
-#else
-# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
-  __clone (__fn, __stack, __flags, __args)
-#endif
-
-/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
-#if _STACK_GROWS_UP || defined (__ia64__)
-# define STACK(__stack, __stack_size) (__stack)
-#elif _STACK_GROWS_DOWN
-# define STACK(__stack, __stack_size) (__stack + __stack_size)
-#endif
-
 
 struct posix_spawn_args
 {
@@ -379,8 +364,15 @@ __spawnix (pid_t * pid, const char *file,
      need for CLONE_SETTLS.  Although parent and child share the same TLS
      namespace, there will be no concurrent access for TLS variables (errno
      for instance).  */
-  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
-		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
+  struct clone_args clone_args =
+    {
+      .flags = CLONE_VM | CLONE_VFORK,
+      .exit_signal = SIGCHLD,
+      .stack = (uintptr_t) stack,
+      .stack_size = stack_size,
+    };
+  new_pid = __clone_internal (&clone_args, sizeof (clone_args),
+			      __spawni_child, &args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
index 6ace61bac3..5f00b8ee4a 100644
--- a/sysdeps/unix/sysv/linux/tst-align-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
@@ -23,6 +23,7 @@
 #include <sys/wait.h>
 #include <unistd.h>
 #include <tst-stack-align.h>
+#include <clone-internal.h>
 #include <stackinfo.h>
 
 static int
@@ -49,21 +50,17 @@ do_test (void)
     ok = false;
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024];
-  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-# if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), 0, 0);
-# elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, 0, 0);
-# else
-#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-# endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
   if (p == -1)
     {
       printf("clone failed: %m\n");
diff --git a/sysdeps/unix/sysv/linux/tst-clone.c b/sysdeps/unix/sysv/linux/tst-clone.c
index e6ae0106ef..2865182f05 100644
--- a/sysdeps/unix/sysv/linux/tst-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-clone.c
@@ -22,11 +22,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <sched.h>
-
-#ifdef __ia64__
-extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		     size_t __child_stack_size, int __flags, void *__arg, ...);
-#endif
+#include <clone-internal.h>
 
 int child_fn(void *arg)
 {
@@ -39,11 +35,7 @@ do_test (void)
 {
   int result;
 
-#ifdef __ia64__
-  result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
-#else
-  result = clone (child_fn, NULL, 0, NULL);
-#endif
+  result = __clone_internal (NULL, 0, child_fn, NULL);
 
   if (errno != EINVAL || result != -1)
     {
diff --git a/sysdeps/unix/sysv/linux/tst-clone2.c b/sysdeps/unix/sysv/linux/tst-clone2.c
index ce36c70c0d..fda77ac48b 100644
--- a/sysdeps/unix/sysv/linux/tst-clone2.c
+++ b/sysdeps/unix/sysv/linux/tst-clone2.c
@@ -26,9 +26,11 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <errno.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <sys/syscall.h>
+#include <clone-internal.h>
 
 #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
 
@@ -70,23 +72,18 @@ do_test (void)
   if (pipe2 (pipefd, O_CLOEXEC))
     FAIL_EXIT1 ("pipe failed: %m");
 
-  int clone_flags = 0;
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), clone_flags, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-#if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), clone_flags, 0);
-#elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, clone_flags, 0);
-#else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-#endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
 
   close (pipefd[1]);
 
diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
index 1a1bfd4586..3e505807b6 100644
--- a/sysdeps/unix/sysv/linux/tst-clone3.c
+++ b/sysdeps/unix/sysv/linux/tst-clone3.c
@@ -20,10 +20,12 @@
 #include <sched.h>
 #include <signal.h>
 #include <unistd.h>
+#include <errno.h>
 #include <sys/syscall.h>
 #include <sys/wait.h>
 #include <sys/types.h>
 #include <linux/futex.h>
+#include <clone-internal.h>
 
 #include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
 #include <support/check.h>
@@ -78,23 +80,15 @@ do_test (void)
   pid_t ctid = CTID_INIT_VAL;
   pid_t tid;
 
-#ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-		  /* tls */ NULL, &ctid);
-#else
-#if _STACK_GROWS_DOWN
-  tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-	       /* tls */ NULL, &ctid);
-#elif _STACK_GROWS_UP
-  tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
-	       &ctid);
-#else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-#endif
-#endif
+  struct clone_args clone_args =
+    {
+      .flags = clone_flags & ~CSIGNAL,
+      .exit_signal = clone_flags & CSIGNAL,
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+      .child_tid = (uintptr_t) &ctid,
+    };
+  tid = __clone_internal (&clone_args, sizeof (clone_args), f, NULL);
   if (tid == -1)
     FAIL_EXIT1 ("clone failed: %m");
 
diff --git a/sysdeps/unix/sysv/linux/tst-getpid1.c b/sysdeps/unix/sysv/linux/tst-getpid1.c
index 253ebf2e15..b78f56df4f 100644
--- a/sysdeps/unix/sysv/linux/tst-getpid1.c
+++ b/sysdeps/unix/sysv/linux/tst-getpid1.c
@@ -5,6 +5,7 @@
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <clone-internal.h>
 #include <stackinfo.h>
 
 #ifndef TEST_CLONE_FLAGS
@@ -42,21 +43,19 @@ do_test (void)
     }
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
+# define STACK_SIZE 256 * 1024
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
-# if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
-# elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, TEST_CLONE_FLAGS, 0);
-# else
-#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-# endif
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+  struct clone_args clone_args =
+    {
+      .flags = TEST_CLONE_FLAGS & ~CSIGNAL,
+      .exit_signal = TEST_CLONE_FLAGS & CSIGNAL,
+      .stack = (uintptr_t) st,
+      .stack_size = sizeof (st),
+    };
+  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
   if (p == -1)
     {
       printf("clone failed: %m\n");
diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
new file mode 100644
index 0000000000..4d99b4b881
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
@@ -0,0 +1,105 @@
+/* The clone3 syscall wrapper.  Linux/x86-64 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* clone3() is even more special than fork() as it mucks with stacks
+   and invokes a function in the right context after its all over.  */
+
+#include <sysdep.h>
+#include <clone-offsets.h>
+
+/* The userland implementation is:
+   int clone3 (struct clone_args *cl_args, size_t size,
+	       int (*fn)(void *arg), void *arg);
+   the kernel entry is:
+   int clone3 (struct clone_args *cl_args, size_t size);
+
+   The parameters are passed in registers from userland:
+   rdi: cl_args
+   rsi: size
+   rdx: fn
+   rcx: arg
+
+   The kernel expects:
+   rax: system call number
+   rdi: cl_args
+   rsi: size  */
+
+        .text
+ENTRY (__clone3)
+	/* Sanity check arguments.  */
+	movq	$-EINVAL, %rax
+	testq	%rdi, %rdi		/* No NULL cl_args pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+	testq	%rdx, %rdx		/* No NULL function pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+
+	/* Load the new stack size into R9.  */
+	movq	CLONE_ARGS_STACK_SIZE_OFFSET(%rdi), %r9
+
+	/* Load the new stack bottom into R8.  */
+	movq	CLONE_ARGS_STACK_OFFSET(%rdi), %r8
+
+	/* Make room for 2 8-byte slots on the new stack.  */
+	subq	$16, %r9
+
+	/* Insert the argument onto the new stack.  */
+	movq	%rcx, 8(%r8, %r9)
+
+	/* Save the function pointer.  It will be popped off in the
+	   child.  */
+	movq	%rdx, (%r8, %r9)
+
+	/* Update the stack size field passed to clone3.  */
+	movq	%r9, CLONE_ARGS_STACK_SIZE_OFFSET(%rdi)
+
+	/* Do the system call.  */
+	movl	$SYS_ify(clone3), %eax
+
+	/* End FDE now, because in the child the unwind info will be
+	   wrong.  */
+	cfi_endproc;
+	syscall
+
+	test	%RAX_LP, %RAX_LP
+	jl	SYSCALL_ERROR_LABEL
+	jz	L(thread_start)
+
+	ret
+
+L(thread_start):
+	cfi_startproc;
+	/* Clearing frame pointer is insufficient, use CFI.  */
+	cfi_undefined (rip);
+	/* Clear the frame pointer.  The ABI suggests this be done, to mark
+	   the outermost frame obviously.  */
+	xorl	%ebp, %ebp
+
+	/* Set up arguments for the function call.  */
+	popq	%rax		/* Function to call.  */
+	popq	%rdi		/* Argument.  */
+	call	*%rax
+	/* Call exit with return value from function call. */
+	movq	%rax, %rdi
+	movl	$SYS_ify(exit), %eax
+	syscall
+	cfi_endproc;
+
+	cfi_startproc;
+PSEUDO_END (__clone3)
+
+libc_hidden_def (__clone3)
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* PING^1 [PATCH v3] Add an internal wrapper for clone, clone2 and clone3
  2021-03-05 16:59           ` [PATCH v3] " H.J. Lu via Libc-alpha
@ 2021-03-18 23:18             ` H.J. Lu via Libc-alpha
  2021-04-19 12:56             ` H.J. Lu via Libc-alpha
  2021-05-10  8:17             ` Florian Weimer via Libc-alpha
  2 siblings, 0 replies; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-03-18 23:18 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Florian Weimer, H.J. Lu via Libc-alpha

On Fri, Mar 5, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Mar 5, 2021 at 12:39 AM Christian Brauner
> <christian.brauner@ubuntu.com> wrote:
> >
> > On Thu, Mar 04, 2021 at 10:50:29AM -0800, H.J. Lu via Libc-alpha wrote:
> > > On Thu, Mar 4, 2021 at 10:40 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Feb 15, 2021 at 6:56 AM Adhemerval Zanella via Libc-alpha
> > > > <libc-alpha@sourceware.org> wrote:
> > > > >
> > > > >
> > > > >
> > > > > On 15/02/2021 07:44, Florian Weimer via Libc-alpha wrote:
> > > > > > * H. J. Lu via Libc-alpha:
> > > > > >
> > > > > >> The clone3 system call provides a superset of the functionality of clone
> > > > > >> and clone2.  It also provides a number of API improve ments, including
> > > > > >> the ability to specify the size of the child's stack area which can be
> > > > > >> used by kernel to compute the shadow stack size when allocating the
> > > > > >> shadow stack.  Add:
> > > > > >>
> > > > > >> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> > > > > >>                           int (*__fn) (void *__arg), void *__arg);
> > > > > >>
> > > > > >> to provide an abstract interface for clone, clone2 and clone3.
> > > > > >>
> > > > > >> 1. Simplify stack management for clone by passing stack base and size
> > > > > >> to __clone_internal.
> > > > > >> 2. Consolidate clone vs clone2 differences into a single file.
> > > > > >> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> > > > > >> with ENOSYS, __clone_internal will fall back to clone or clone2.
> > > > > >> 4. Add the x86-64 clone3 wrapper.
> > > > > >> 5. Enable the public clone3 wrapper in the future after it has been
> > > > > >> added to add targets.
> > > > > >
> > > > > > What do you think about providing a clone wrapper which reuses the
> > > > > > caller's stack?  That would be useful for a (v)fork-style clone.  This
> > > > > > variant could also be exported because the callback inherits a
> > > > > > semi-valid TCB in this case.
> > > > > I have this one as a TODO to improve the posix_spawn.
> > > >
> > > > Here is the v2 patch:
> > > >
> > > > 1. Update the stack size field, instead of the stack field, passed to
> > > > clone3.
> > > >
> > > > OK for master?
> > >
> > > Really add the patch this time :-).
> > >
> > > --
> > > H.J.
> >
> > Just a small comment.
> >
>
> > > +      __set_errno (0);
> > > +      /* Map clone3 arguments to clone arguments.  */
> > > +      int flags = cl_args->flags | cl_args->exit_signal;
> > > +      void *stack = (void *) (uintptr_t) cl_args->stack;
> > > +
> > > +#ifdef __ia64__
> > > +      ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
> > > +                   flags, arg,
> > > +                   (void *) (uintptr_t) cl_args->parent_tid,
> > > +                   (void *) (uintptr_t) cl_args->tls,
> > > +                   (void *) (uintptr_t) cl_args->child_tid);
> >
> > Should glibc maybe just carry a few simple macros to do this instead of
> > open-coding the casting? (We do in systemd and LXC for example.):
> >
> > #define PTR_TO_INT(p) ((int)((intptr_t)(p)))
> > #define INT_TO_PTR(u) ((void *)((intptr_t)(u)))
> >
> > #define PTR_TO_PID(p) ((pid_t)((intptr_t)(p)))
> > #define PID_TO_PTR(u) ((void *)((intptr_t)(u)))
> >
> > #define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p)))
> >
> > #define PTR_TO_U64(p) ((__u64)((uintptr_t)(p)))
> >
> > #define UINT_TO_PTR(u) ((void *) ((uintptr_t) (u)))
> >
> > #define PTR_TO_USHORT(p) ((unsigned short)((uintptr_t)(p)))
> >
>
> Fixed.
>
> Here is the v3 patch.   OK for master?
>
> Changes from v2:
>
> 1. Add cast_to_pointer to cast an integer to void * pointer.
>

PING.

-- 
H.J.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* PING^1 [PATCH v3] Add an internal wrapper for clone, clone2 and clone3
  2021-03-05 16:59           ` [PATCH v3] " H.J. Lu via Libc-alpha
  2021-03-18 23:18             ` PING^1 " H.J. Lu via Libc-alpha
@ 2021-04-19 12:56             ` H.J. Lu via Libc-alpha
  2021-05-10  8:17             ` Florian Weimer via Libc-alpha
  2 siblings, 0 replies; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-04-19 12:56 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Florian Weimer, H.J. Lu via Libc-alpha

On Fri, Mar 5, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Mar 5, 2021 at 12:39 AM Christian Brauner
> <christian.brauner@ubuntu.com> wrote:
> >
> > On Thu, Mar 04, 2021 at 10:50:29AM -0800, H.J. Lu via Libc-alpha wrote:
> > > On Thu, Mar 4, 2021 at 10:40 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Feb 15, 2021 at 6:56 AM Adhemerval Zanella via Libc-alpha
> > > > <libc-alpha@sourceware.org> wrote:
> > > > >
> > > > >
> > > > >
> > > > > On 15/02/2021 07:44, Florian Weimer via Libc-alpha wrote:
> > > > > > * H. J. Lu via Libc-alpha:
> > > > > >
> > > > > >> The clone3 system call provides a superset of the functionality of clone
> > > > > >> and clone2.  It also provides a number of API improve ments, including
> > > > > >> the ability to specify the size of the child's stack area which can be
> > > > > >> used by kernel to compute the shadow stack size when allocating the
> > > > > >> shadow stack.  Add:
> > > > > >>
> > > > > >> extern int __clone_internal (struct clone_args *cl_args, size_t size,
> > > > > >>                           int (*__fn) (void *__arg), void *__arg);
> > > > > >>
> > > > > >> to provide an abstract interface for clone, clone2 and clone3.
> > > > > >>
> > > > > >> 1. Simplify stack management for clone by passing stack base and size
> > > > > >> to __clone_internal.
> > > > > >> 2. Consolidate clone vs clone2 differences into a single file.
> > > > > >> 3. Use only __clone_internal to clone a thread.  If clone3 returns -1
> > > > > >> with ENOSYS, __clone_internal will fall back to clone or clone2.
> > > > > >> 4. Add the x86-64 clone3 wrapper.
> > > > > >> 5. Enable the public clone3 wrapper in the future after it has been
> > > > > >> added to add targets.
> > > > > >
> > > > > > What do you think about providing a clone wrapper which reuses the
> > > > > > caller's stack?  That would be useful for a (v)fork-style clone.  This
> > > > > > variant could also be exported because the callback inherits a
> > > > > > semi-valid TCB in this case.
> > > > > I have this one as a TODO to improve the posix_spawn.
> > > >
> > > > Here is the v2 patch:
> > > >
> > > > 1. Update the stack size field, instead of the stack field, passed to
> > > > clone3.
> > > >
> > > > OK for master?
> > >
> > > Really add the patch this time :-).
> > >
> > > --
> > > H.J.
> >
> > Just a small comment.
> >
>
> > > +      __set_errno (0);
> > > +      /* Map clone3 arguments to clone arguments.  */
> > > +      int flags = cl_args->flags | cl_args->exit_signal;
> > > +      void *stack = (void *) (uintptr_t) cl_args->stack;
> > > +
> > > +#ifdef __ia64__
> > > +      ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
> > > +                   flags, arg,
> > > +                   (void *) (uintptr_t) cl_args->parent_tid,
> > > +                   (void *) (uintptr_t) cl_args->tls,
> > > +                   (void *) (uintptr_t) cl_args->child_tid);
> >
> > Should glibc maybe just carry a few simple macros to do this instead of
> > open-coding the casting? (We do in systemd and LXC for example.):
> >
> > #define PTR_TO_INT(p) ((int)((intptr_t)(p)))
> > #define INT_TO_PTR(u) ((void *)((intptr_t)(u)))
> >
> > #define PTR_TO_PID(p) ((pid_t)((intptr_t)(p)))
> > #define PID_TO_PTR(u) ((void *)((intptr_t)(u)))
> >
> > #define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p)))
> >
> > #define PTR_TO_U64(p) ((__u64)((uintptr_t)(p)))
> >
> > #define UINT_TO_PTR(u) ((void *) ((uintptr_t) (u)))
> >
> > #define PTR_TO_USHORT(p) ((unsigned short)((uintptr_t)(p)))
> >
>
> Fixed.
>
> Here is the v3 patch.   OK for master?
>
> Changes from v2:
>
> 1. Add cast_to_pointer to cast an integer to void * pointer.
>
> --
> H.J.

PING.

-- 
H.J.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] Add an internal wrapper for clone, clone2 and clone3
  2021-03-05 16:59           ` [PATCH v3] " H.J. Lu via Libc-alpha
  2021-03-18 23:18             ` PING^1 " H.J. Lu via Libc-alpha
  2021-04-19 12:56             ` H.J. Lu via Libc-alpha
@ 2021-05-10  8:17             ` Florian Weimer via Libc-alpha
  2021-05-10 23:56               ` H.J. Lu via Libc-alpha
  2 siblings, 1 reply; 14+ messages in thread
From: Florian Weimer via Libc-alpha @ 2021-05-10  8:17 UTC (permalink / raw)
  To: H.J. Lu via Libc-alpha; +Cc: Christian Brauner

* H. J. Lu via Libc-alpha:

> diff --git a/include/sched.h b/include/sched.h
> index b0bf971c93..28dd99a571 100644
> --- a/include/sched.h
> +++ b/include/sched.h
> @@ -32,5 +32,8 @@ libc_hidden_proto (__clone2)
>     and Hurd doesn't have it.  */
>  extern int __getcpu (unsigned int *, unsigned int *);
>  libc_hidden_proto (__getcpu)
> +
> +# include <clone-internal.h>
> +libc_hidden_proto (__clone_internal)
>  #endif
>  #endif

I think the libc_hidden_proto should be in <clone3.h> below, and
including the header here shouldn't be necessary.

> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index 149b999603..73c7f33a00 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -34,47 +34,6 @@
>  #include <stack-aliasing.h>
>  
>  
> -#ifndef NEED_SEPARATE_REGISTER_STACK

This conflicts with my libpthread removal work.

Maybe it is possible to do these changes in a separate refactoring?

> diff --git a/sysdeps/generic/clone-internal.h b/sysdeps/generic/clone-internal.h
> new file mode 100644
> index 0000000000..1c1f997ca5
> --- /dev/null
> +++ b/sysdeps/generic/clone-internal.h

This should be in sysdeps/unix/sysv/linux because it is Linux-specific.

I would call it <clone3.h> (still as an internal-only header of course).

> @@ -0,0 +1,49 @@
> +/* The internal wrapper of clone and clone3.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +#include <stddef.h>
> +
> +struct clone_args
> +{
> +  uint64_t flags;	 /* Flags bit mask.  */
> +  uint64_t pidfd;	 /* Where to store PID file descriptor
> +			    (pid_t *).  */
> +  uint64_t child_tid;	 /* Where to store child TID, in child's memory
> +			    (pid_t *).  */
> +  uint64_t parent_tid;	 /* Where to store child TID, in parent's memory
> +			    (int *). */
> +  uint64_t exit_signal;	 /* Signal to deliver to parent on child
> +			    termination */
> +  uint64_t stack;	 /* The lowest address of stack.  */
> +  uint64_t stack_size;	 /* Size of stack.  */
> +  uint64_t tls;		 /* Location of new TLS.  */
> +  uint64_t set_tid;	 /* Pointer to a pid_t array
> +			    (since Linux 5.5).  */
> +  uint64_t set_tid_size; /* Number of elements in set_tid
> +			    (since Linux 5.5). */
> +  uint64_t cgroup;	 /* File descriptor for target cgroup
> +			    of child (since Linux 5.7).  */
> +};

I think this has to be surrounded by #ifdef CLONE_ARGS_SIZE_VER0 (after
including <linux/sched.h>).  This assumes we actually can include
<linux/sched.h> in the glibc build, which I haven't checked.  But I
think it is worth a try.

If we can't use <linux/sched.h>, it would be safer to rename that
struct, in case kernel headers begin to define it.

> +extern int __clone3 (struct clone_args *cl_args, size_t size,
> +		     int (*__fn) (void *__arg), void *__arg)
> +  __attribute__ ((visibility ("hidden")));

Please use libc_hidden_proto (without the explicit visibility change).

The internal __clone3 call should not have size argument because the
size of struct clone_args is known during the build.  Soon we won't need
GLIBC_PRIVATE, so there is no risk of this backwards-incompatible
interface leaking out to userspace.

Likewise for __clone_internal.

> diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
> new file mode 100644
> index 0000000000..fd9929b6df
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone-internal.c
> @@ -0,0 +1,62 @@

> +#include <errno.h>
> +#include <sched.h>
> +#include <libc-pointer-arith.h>	/* For cast_to_pointer.  */
> +#include <stackinfo.h>		/* For _STACK_GROWS_{UP,DOWN}.  */
> +
> +int
> +__clone_internal (struct clone_args *cl_args, size_t size,
> +		  int (*fn)(void *arg), void *arg)
> +{
> +  /* Try clone3 first.  */
> +  int ret = __clone3 (cl_args, size, fn, arg);
> +  if (ret == -1 && errno == ENOSYS)
> +    {
> +      /* NB: Must clear errno since errno may be checked against non-zero
> +         return value.  */
> +      __set_errno (0);

Setting errno to zero doesn't look right.  This should probably restore
the original errno value.

> +      /* Map clone3 arguments to clone arguments.  */
> +      int flags = cl_args->flags | cl_args->exit_signal;
> +      void *stack = cast_to_pointer (cl_args->stack);

I think you should check here if there are flags set outside of 0xff
because the kernel won't.

> +libc_hidden_def (__clone_internal)
> diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
> new file mode 100644
> index 0000000000..38d447bcf3
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
> @@ -0,0 +1,9 @@
> +#include <stddef.h>
> +#include <sched.h>
> +
> +--
> +
> +#define clone_args_offset(member) offsetof (struct clone_args, member)
> +
> +CLONE_ARGS_STACK_OFFSET		clone_args_offset(stack)
> +CLONE_ARGS_STACK_SIZE_OFFSET	clone_args_offset(stack_size)

Missing spaces before '('.

> diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
> index 6ace61bac3..5f00b8ee4a 100644
> --- a/sysdeps/unix/sysv/linux/tst-align-clone.c
> +++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
> @@ -23,6 +23,7 @@
>  #include <sys/wait.h>
>  #include <unistd.h>
>  #include <tst-stack-align.h>
> +#include <clone-internal.h>
>  #include <stackinfo.h>
>  
>  static int
> @@ -49,21 +50,17 @@ do_test (void)
>      ok = false;
>  
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  char st[256 * 1024];
> -  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
> +# define STACK_SIZE 256 * 1024
>  #else
> -  char st[128 * 1024] __attribute__ ((aligned));
> -# if _STACK_GROWS_DOWN
> -  pid_t p = clone (f, st + sizeof (st), 0, 0);
> -# elif _STACK_GROWS_UP
> -  pid_t p = clone (f, st, 0, 0);
> -# else
> -#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> -# endif
> +# define STACK_SIZE 128 * 1024
>  #endif
> +  char st[STACK_SIZE] __attribute__ ((aligned));
> +  struct clone_args clone_args =
> +    {
> +      .stack = (uintptr_t) st,
> +      .stack_size = sizeof (st),
> +    };
> +  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
>    if (p == -1)
>      {
>        printf("clone failed: %m\n");

Please do not remvoe coverage of the clone/__clone2 interface from the
test.  Either test both here or add a new test.

The same comment applies to the other tests as well.


> diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> new file mode 100644
> index 0000000000..4d99b4b881
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> @@ -0,0 +1,105 @@
> +/* The clone3 syscall wrapper.  Linux/x86-64 version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* clone3() is even more special than fork() as it mucks with stacks
> +   and invokes a function in the right context after its all over.  */
> +
> +#include <sysdep.h>
> +#include <clone-offsets.h>
> +
> +/* The userland implementation is:
> +   int clone3 (struct clone_args *cl_args, size_t size,
> +	       int (*fn)(void *arg), void *arg);
> +   the kernel entry is:
> +   int clone3 (struct clone_args *cl_args, size_t size);
> +
> +   The parameters are passed in registers from userland:
> +   rdi: cl_args
> +   rsi: size
> +   rdx: fn
> +   rcx: arg
> +
> +   The kernel expects:
> +   rax: system call number
> +   rdi: cl_args
> +   rsi: size  */
> +
> +        .text
> +ENTRY (__clone3)
> +	/* Sanity check arguments.  */
> +	movq	$-EINVAL, %rax
> +	testq	%rdi, %rdi		/* No NULL cl_args pointer.  */
> +	jz	SYSCALL_ERROR_LABEL
> +	testq	%rdx, %rdx		/* No NULL function pointer.  */
> +	jz	SYSCALL_ERROR_LABEL

I don't think we need the error checking for the internal usage.

> +	/* Load the new stack size into R9.  */
> +	movq	CLONE_ARGS_STACK_SIZE_OFFSET(%rdi), %r9
> +
> +	/* Load the new stack bottom into R8.  */
> +	movq	CLONE_ARGS_STACK_OFFSET(%rdi), %r8
> +
> +	/* Make room for 2 8-byte slots on the new stack.  */
> +	subq	$16, %r9
> +
> +	/* Insert the argument onto the new stack.  */
> +	movq	%rcx, 8(%r8, %r9)
> +
> +	/* Save the function pointer.  It will be popped off in the
> +	   child.  */
> +	movq	%rdx, (%r8, %r9)
> +
> +	/* Update the stack size field passed to clone3.  */
> +	movq	%r9, CLONE_ARGS_STACK_SIZE_OFFSET(%rdi)

Why is this necessary?  This is rather surprising.  I think you should
avoid modification here.  Can you pass the data to thread_start in
registers instead?

Thanks,
Florian


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] Add an internal wrapper for clone, clone2 and clone3
  2021-05-10  8:17             ` Florian Weimer via Libc-alpha
@ 2021-05-10 23:56               ` H.J. Lu via Libc-alpha
  0 siblings, 0 replies; 14+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-05-10 23:56 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Christian Brauner, H.J. Lu via Libc-alpha

On Mon, May 10, 2021 at 1:17 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu via Libc-alpha:
>
> > diff --git a/include/sched.h b/include/sched.h
> > index b0bf971c93..28dd99a571 100644
> > --- a/include/sched.h
> > +++ b/include/sched.h
> > @@ -32,5 +32,8 @@ libc_hidden_proto (__clone2)
> >     and Hurd doesn't have it.  */
> >  extern int __getcpu (unsigned int *, unsigned int *);
> >  libc_hidden_proto (__getcpu)
> > +
> > +# include <clone-internal.h>
> > +libc_hidden_proto (__clone_internal)
> >  #endif
> >  #endif
>
> I think the libc_hidden_proto should be in <clone3.h> below, and
> including the header here shouldn't be necessary.

Fixed.

> > diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> > index 149b999603..73c7f33a00 100644
> > --- a/nptl/allocatestack.c
> > +++ b/nptl/allocatestack.c
> > @@ -34,47 +34,6 @@
> >  #include <stack-aliasing.h>
> >
> >
> > -#ifndef NEED_SEPARATE_REGISTER_STACK
>
> This conflicts with my libpthread removal work.
>
> Maybe it is possible to do these changes in a separate refactoring?

Fixed.

> > diff --git a/sysdeps/generic/clone-internal.h b/sysdeps/generic/clone-internal.h
> > new file mode 100644
> > index 0000000000..1c1f997ca5
> > --- /dev/null
> > +++ b/sysdeps/generic/clone-internal.h
>
> This should be in sysdeps/unix/sysv/linux because it is Linux-specific.
>
> I would call it <clone3.h> (still as an internal-only header of course).

Fixed.

> > @@ -0,0 +1,49 @@
> > +/* The internal wrapper of clone and clone3.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <stdint.h>
> > +#include <stddef.h>
> > +
> > +struct clone_args
> > +{
> > +  uint64_t flags;     /* Flags bit mask.  */
> > +  uint64_t pidfd;     /* Where to store PID file descriptor
> > +                         (pid_t *).  */
> > +  uint64_t child_tid;         /* Where to store child TID, in child's memory
> > +                         (pid_t *).  */
> > +  uint64_t parent_tid;        /* Where to store child TID, in parent's memory
> > +                         (int *). */
> > +  uint64_t exit_signal;       /* Signal to deliver to parent on child
> > +                         termination */
> > +  uint64_t stack;     /* The lowest address of stack.  */
> > +  uint64_t stack_size;        /* Size of stack.  */
> > +  uint64_t tls;               /* Location of new TLS.  */
> > +  uint64_t set_tid;   /* Pointer to a pid_t array
> > +                         (since Linux 5.5).  */
> > +  uint64_t set_tid_size; /* Number of elements in set_tid
> > +                         (since Linux 5.5). */
> > +  uint64_t cgroup;    /* File descriptor for target cgroup
> > +                         of child (since Linux 5.7).  */
> > +};
>
> I think this has to be surrounded by #ifdef CLONE_ARGS_SIZE_VER0 (after
> including <linux/sched.h>).  This assumes we actually can include
> <linux/sched.h> in the glibc build, which I haven't checked.  But I
> think it is worth a try.
>
> If we can't use <linux/sched.h>, it would be safer to rename that
> struct, in case kernel headers begin to define it.

Fixed.

> > +extern int __clone3 (struct clone_args *cl_args, size_t size,
> > +                  int (*__fn) (void *__arg), void *__arg)
> > +  __attribute__ ((visibility ("hidden")));
>
> Please use libc_hidden_proto (without the explicit visibility change).

Fixed.

> The internal __clone3 call should not have size argument because the
> size of struct clone_args is known during the build.  Soon we won't need
> GLIBC_PRIVATE, so there is no risk of this backwards-incompatible
> interface leaking out to userspace.

Fixed.  __clone3 is only for internal use at the moment.  But we can export
it later if needed.

> Likewise for __clone_internal.

Fixed.  __clone_internal is kept in GLIBC_PRIVATE for glibc clone
tests.

> > diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
> > new file mode 100644
> > index 0000000000..fd9929b6df
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/clone-internal.c
> > @@ -0,0 +1,62 @@
>
> > +#include <errno.h>
> > +#include <sched.h>
> > +#include <libc-pointer-arith.h>      /* For cast_to_pointer.  */
> > +#include <stackinfo.h>               /* For _STACK_GROWS_{UP,DOWN}.  */
> > +
> > +int
> > +__clone_internal (struct clone_args *cl_args, size_t size,
> > +               int (*fn)(void *arg), void *arg)
> > +{
> > +  /* Try clone3 first.  */
> > +  int ret = __clone3 (cl_args, size, fn, arg);
> > +  if (ret == -1 && errno == ENOSYS)
> > +    {
> > +      /* NB: Must clear errno since errno may be checked against non-zero
> > +         return value.  */
> > +      __set_errno (0);
>
> Setting errno to zero doesn't look right.  This should probably restore
> the original errno value.

Fixed.

> > +      /* Map clone3 arguments to clone arguments.  */
> > +      int flags = cl_args->flags | cl_args->exit_signal;
> > +      void *stack = cast_to_pointer (cl_args->stack);
>
> I think you should check here if there are flags set outside of 0xff
> because the kernel won't.

Fixed.

> > +libc_hidden_def (__clone_internal)
> > diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
> > new file mode 100644
> > index 0000000000..38d447bcf3
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
> > @@ -0,0 +1,9 @@
> > +#include <stddef.h>
> > +#include <sched.h>
> > +
> > +--
> > +
> > +#define clone_args_offset(member) offsetof (struct clone_args, member)
> > +
> > +CLONE_ARGS_STACK_OFFSET              clone_args_offset(stack)
> > +CLONE_ARGS_STACK_SIZE_OFFSET clone_args_offset(stack_size)
>
> Missing spaces before '('.

Fixed.

> > diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
> > index 6ace61bac3..5f00b8ee4a 100644
> > --- a/sysdeps/unix/sysv/linux/tst-align-clone.c
> > +++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
> > @@ -23,6 +23,7 @@
> >  #include <sys/wait.h>
> >  #include <unistd.h>
> >  #include <tst-stack-align.h>
> > +#include <clone-internal.h>
> >  #include <stackinfo.h>
> >
> >  static int
> > @@ -49,21 +50,17 @@ do_test (void)
> >      ok = false;
> >
> >  #ifdef __ia64__
> > -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> > -                    size_t __child_stack_size, int __flags,
> > -                    void *__arg, ...);
> > -  char st[256 * 1024];
> > -  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
> > +# define STACK_SIZE 256 * 1024
> >  #else
> > -  char st[128 * 1024] __attribute__ ((aligned));
> > -# if _STACK_GROWS_DOWN
> > -  pid_t p = clone (f, st + sizeof (st), 0, 0);
> > -# elif _STACK_GROWS_UP
> > -  pid_t p = clone (f, st, 0, 0);
> > -# else
> > -#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> > -# endif
> > +# define STACK_SIZE 128 * 1024
> >  #endif
> > +  char st[STACK_SIZE] __attribute__ ((aligned));
> > +  struct clone_args clone_args =
> > +    {
> > +      .stack = (uintptr_t) st,
> > +      .stack_size = sizeof (st),
> > +    };
> > +  pid_t p = __clone_internal (&clone_args, sizeof (clone_args), f, 0);
> >    if (p == -1)
> >      {
> >        printf("clone failed: %m\n");
>
> Please do not remvoe coverage of the clone/__clone2 interface from the
> test.  Either test both here or add a new test.
>
> The same comment applies to the other tests as well.

Fixed.

>
> > diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> > new file mode 100644
> > index 0000000000..4d99b4b881
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> > @@ -0,0 +1,105 @@
> > +/* The clone3 syscall wrapper.  Linux/x86-64 version.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* clone3() is even more special than fork() as it mucks with stacks
> > +   and invokes a function in the right context after its all over.  */
> > +
> > +#include <sysdep.h>
> > +#include <clone-offsets.h>
> > +
> > +/* The userland implementation is:
> > +   int clone3 (struct clone_args *cl_args, size_t size,
> > +            int (*fn)(void *arg), void *arg);
> > +   the kernel entry is:
> > +   int clone3 (struct clone_args *cl_args, size_t size);
> > +
> > +   The parameters are passed in registers from userland:
> > +   rdi: cl_args
> > +   rsi: size
> > +   rdx: fn
> > +   rcx: arg
> > +
> > +   The kernel expects:
> > +   rax: system call number
> > +   rdi: cl_args
> > +   rsi: size  */
> > +
> > +        .text
> > +ENTRY (__clone3)
> > +     /* Sanity check arguments.  */
> > +     movq    $-EINVAL, %rax
> > +     testq   %rdi, %rdi              /* No NULL cl_args pointer.  */
> > +     jz      SYSCALL_ERROR_LABEL
> > +     testq   %rdx, %rdx              /* No NULL function pointer.  */
> > +     jz      SYSCALL_ERROR_LABEL
>
> I don't think we need the error checking for the internal usage.

The clone3 wrapper can be for external use, just like clone.

> > +     /* Load the new stack size into R9.  */
> > +     movq    CLONE_ARGS_STACK_SIZE_OFFSET(%rdi), %r9
> > +
> > +     /* Load the new stack bottom into R8.  */
> > +     movq    CLONE_ARGS_STACK_OFFSET(%rdi), %r8
> > +
> > +     /* Make room for 2 8-byte slots on the new stack.  */
> > +     subq    $16, %r9
> > +
> > +     /* Insert the argument onto the new stack.  */
> > +     movq    %rcx, 8(%r8, %r9)
> > +
> > +     /* Save the function pointer.  It will be popped off in the
> > +        child.  */
> > +     movq    %rdx, (%r8, %r9)
> > +
> > +     /* Update the stack size field passed to clone3.  */
> > +     movq    %r9, CLONE_ARGS_STACK_SIZE_OFFSET(%rdi)
>
> Why is this necessary?  This is rather surprising.  I think you should
> avoid modification here.  Can you pass the data to thread_start in
> registers instead?
>

Fixed.

I will submit the v4 patch.


-- 
H.J.

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-05-10 23:57 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-14 22:45 [PATCH] Add an internal wrapper for clone, clone2 and clone3 H.J. Lu via Libc-alpha
2021-02-15 10:44 ` Florian Weimer via Libc-alpha
2021-02-15 14:17   ` H.J. Lu via Libc-alpha
2021-02-15 14:27     ` Florian Weimer via Libc-alpha
2021-02-15 14:44       ` H.J. Lu via Libc-alpha
2021-02-15 14:39   ` Adhemerval Zanella via Libc-alpha
2021-03-04 18:40     ` [PATCH v2] " H.J. Lu via Libc-alpha
2021-03-04 18:50       ` H.J. Lu via Libc-alpha
2021-03-05  8:32         ` Christian Brauner
2021-03-05 16:59           ` [PATCH v3] " H.J. Lu via Libc-alpha
2021-03-18 23:18             ` PING^1 " H.J. Lu via Libc-alpha
2021-04-19 12:56             ` H.J. Lu via Libc-alpha
2021-05-10  8:17             ` Florian Weimer via Libc-alpha
2021-05-10 23:56               ` H.J. Lu via Libc-alpha

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).