diff --git a/NEWS b/NEWS index 485b8ddffa..d138a45519 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,30 @@ See the end for copying conditions. Please send GNU C library bug reports via using `glibc' in the "product" field. +The following bugs are resolved with this release: + + [20019] NULL pointer dereference in libc.so.6 IFUNC due to uninitialized GOT + [26224] iconv hangs when converting some invalid inputs from several IBM + character sets (CVE-2020-27618) + [26534] libm.so 2.32 SIGILL in pow() due to FMA4 instruction on non-FMA4 + system + [26555] string: strerrorname_np does not return the documented value + [26600] Transaction ID collisions cause slow DNS lookups in getaddrinfo + [26636] libc: 32-bit shmctl(IPC_INFO) crashes when shminfo struct is + at the end of a memory mapping + [26637] libc: semctl SEM_STAT_ANY fails to pass the buffer specified + by the caller to the kernel + [26639] libc: msgctl IPC_INFO and MSG_INFO return garbage + [26853] aarch64: Missing unwind information in statically linked startup code + [26932] libc: sh: Multiple floating point functions defined as stubs only + [27130] "rep movsb" performance issue + [27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work + [27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts + [27974] Overflow bug in some implementation of wcsnlen, wmemchr, and wcsncat + [28524] Conversion from ISO-2022-JP-3 with iconv may emit spurious NULs + [28607] Masked signals are delivered on thread exit + [28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex + Version 2.32 Major new features: @@ -185,6 +209,14 @@ Security related changes: Dytrych of the Cisco Security Assessment and Penetration Team (See TALOS-2020-1019). + CVE-2020-27618: An infinite loop has been fixed in the iconv program when + invoked with input containing redundant shift sequences in the IBM1364, + IBM1371, IBM1388, IBM1390, or IBM1399 character sets. + + CVE-2021-33574: The mq_notify function has a potential use-after-free + issue when using a notification type of SIGEV_THREAD and a thread + attribute with a non-default affinity mask. + The following bugs are resolved with this release: [9809] localedata: ckb_IQ: new Kurdish Sorani locale diff --git a/Rules b/Rules index 8b771f6095..beab969fde 100644 --- a/Rules +++ b/Rules @@ -155,6 +155,7 @@ xtests: tests $(xtests-special) else tests: $(tests:%=$(objpfx)%.out) $(tests-internal:%=$(objpfx)%.out) \ $(tests-container:%=$(objpfx)%.out) \ + $(tests-mcheck:%=$(objpfx)%-mcheck.out) \ $(tests-special) $(tests-printers-out) xtests: tests $(xtests:%=$(objpfx)%.out) $(xtests-special) endif @@ -165,7 +166,7 @@ ifeq ($(run-built-tests),no) tests-expected = else tests-expected = $(tests) $(tests-internal) $(tests-printers) \ - $(tests-container) + $(tests-container) $(tests-mcheck:%=%-mcheck) endif tests: $(..)scripts/merge-test-results.sh -s $(objpfx) $(subdir) \ @@ -191,6 +192,7 @@ else binaries-pie-tests = binaries-pie-notests = endif +binaries-mcheck-tests = $(tests-mcheck:%=%-mcheck) else binaries-all-notests = binaries-all-tests = $(tests) $(tests-internal) $(xtests) $(test-srcs) @@ -200,6 +202,7 @@ binaries-static-tests = binaries-static = binaries-pie-tests = binaries-pie-notests = +binaries-mcheck-tests = endif binaries-pie = $(binaries-pie-tests) $(binaries-pie-notests) @@ -223,6 +226,14 @@ $(addprefix $(objpfx),$(binaries-shared-tests)): %: %.o \ $(+link-tests) endif +ifneq "$(strip $(binaries-mcheck-tests))" "" +$(addprefix $(objpfx),$(binaries-mcheck-tests)): %-mcheck: %.o \ + $(link-extra-libs-tests) \ + $(sort $(filter $(common-objpfx)lib%,$(link-libc))) \ + $(addprefix $(csu-objpfx),start.o) $(+preinit) $(+postinit) + $(+link-tests) +endif + ifneq "$(strip $(binaries-pie-tests))" "" $(addprefix $(objpfx),$(binaries-pie-tests)): %: %.o \ $(link-extra-libs-tests) \ @@ -253,6 +264,12 @@ $(addprefix $(objpfx),$(binaries-static-tests)): %: %.o \ $(+link-static-tests) endif +# All mcheck tests will be run with MALLOC_CHECK_=3 +define mcheck-ENVS +$(1)-mcheck-ENV = MALLOC_CHECK_=3 +endef +$(foreach t,$(tests-mcheck),$(eval $(call mcheck-ENVS,$(t)))) + ifneq "$(strip $(tests) $(tests-internal) $(xtests) $(test-srcs))" "" # These are the implicit rules for making test outputs # from the test programs and whatever input files are present. diff --git a/debug/Makefile b/debug/Makefile index 3a60d7af7a..0036edd187 100644 --- a/debug/Makefile +++ b/debug/Makefile @@ -51,7 +51,7 @@ routines = backtrace backtracesyms backtracesymsfd noophooks \ explicit_bzero_chk \ stack_chk_fail fortify_fail \ $(static-only-routines) -static-only-routines := warning-nop stack_chk_fail_local +static-only-routines := stack_chk_fail_local # Don't add stack_chk_fail_local.o to libc.a since __stack_chk_fail_local # is an alias of __stack_chk_fail in stack_chk_fail.o. diff --git a/debug/warning-nop.c b/debug/warning-nop.c deleted file mode 100644 index 4ab7e182b7..0000000000 --- a/debug/warning-nop.c +++ /dev/null @@ -1,70 +0,0 @@ -/* Dummy nop functions to elicit link-time warnings. - Copyright (C) 2005-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - In addition to the permissions in the GNU Lesser General Public - License, the Free Software Foundation gives you unlimited - permission to link the compiled version of this file with other - programs, and to distribute those programs without any restriction - coming from the use of this file. (The GNU Lesser General Public - License restrictions do apply in other respects; for example, they - cover modification of the file, and distribution when not linked - into another program.) - - Note that people who make modified versions of this file are not - obligated to grant this special exception for their modified - versions; it is their choice whether to do so. The GNU Lesser - General Public License gives permission to release a modified - version without this exception; this exception also makes it - possible to release a modified version which carries forward this - exception. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include - -static void -__attribute__ ((used)) -nop (void) -{ -} - -/* Don't insert any other #include's before this #undef! */ - -#undef __warndecl -#define __warndecl(name, msg) \ - extern void name (void) __attribute__ ((alias ("nop"))) attribute_hidden; \ - link_warning (name, msg) - -#undef __USE_FORTIFY_LEVEL -#define __USE_FORTIFY_LEVEL 99 - -/* Following here we need an #include for each public header file - that uses __warndecl. */ - -/* Define away to avoid warnings with compilers that do not have these - builtins. */ -#define __builtin___memcpy_chk(dest, src, len, bos) NULL -#define __builtin___memmove_chk(dest, src, len, bos) NULL -#define __builtin___mempcpy_chk(dest, src, len, bos) NULL -#define __builtin___memset_chk(dest, ch, len, bos) NULL -#define __builtin___stpcpy_chk(dest, src, bos) NULL -#define __builtin___strcat_chk(dest, src, bos) NULL -#define __builtin___strcpy_chk(dest, src, bos) NULL -#define __builtin___strncat_chk(dest, src, len, bos) NULL -#define __builtin___strncpy_chk(dest, src, len, bos) NULL -#define __builtin_object_size(bos, level) 0 - -#include diff --git a/elf/Makefile b/elf/Makefile index 0b78721848..3ba7f4ecfc 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -1381,6 +1381,8 @@ CFLAGS-ifuncmain7pie.c += $(pie-ccflag) CFLAGS-ifuncmain9pie.c += $(pie-ccflag) CFLAGS-tst-ifunc-textrel.c += $(pic-ccflag) +LDFLAGS-ifuncmain6pie = -Wl,-z,lazy + $(objpfx)ifuncmain1pie: $(objpfx)ifuncmod1.so $(objpfx)ifuncmain1staticpie: $(objpfx)ifuncdep1pic.o $(objpfx)ifuncmain1vispie: $(objpfx)ifuncmod1.so @@ -1630,8 +1632,6 @@ $(objpfx)tst-nodelete-dlclose.out: $(objpfx)tst-nodelete-dlclose-dso.so \ tst-env-setuid-ENV = MALLOC_CHECK_=2 MALLOC_MMAP_THRESHOLD_=4096 \ LD_HWCAP_MASK=0x1 -tst-env-setuid-tunables-ENV = \ - GLIBC_TUNABLES=glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096 $(objpfx)tst-debug1: $(libdl) $(objpfx)tst-debug1.out: $(objpfx)tst-debug1mod1.so diff --git a/elf/dl-load.c b/elf/dl-load.c index e39980fb19..71867e7c1a 100644 --- a/elf/dl-load.c +++ b/elf/dl-load.c @@ -855,10 +855,12 @@ lose (int code, int fd, const char *name, char *realname, struct link_map *l, /* Process PT_GNU_PROPERTY program header PH in module L after PT_LOAD segments are mapped. Only one NT_GNU_PROPERTY_TYPE_0 - note is handled which contains processor specific properties. */ + note is handled which contains processor specific properties. + FD is -1 for the kernel mapped main executable otherwise it is + the fd used for loading module L. */ void -_dl_process_pt_gnu_property (struct link_map *l, const ElfW(Phdr) *ph) +_dl_process_pt_gnu_property (struct link_map *l, int fd, const ElfW(Phdr) *ph) { const ElfW(Nhdr) *note = (const void *) (ph->p_vaddr + l->l_addr); const ElfW(Addr) size = ph->p_memsz; @@ -905,7 +907,7 @@ _dl_process_pt_gnu_property (struct link_map *l, const ElfW(Phdr) *ph) last_type = type; /* Target specific property processing. */ - if (_dl_process_gnu_property (l, type, datasz, ptr) == 0) + if (_dl_process_gnu_property (l, fd, type, datasz, ptr) == 0) return; /* Check the next property item. */ @@ -1251,21 +1253,6 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd, maplength, has_holes, loader); if (__glibc_unlikely (errstring != NULL)) goto call_lose; - - /* Process program headers again after load segments are mapped in - case processing requires accessing those segments. Scan program - headers backward so that PT_NOTE can be skipped if PT_GNU_PROPERTY - exits. */ - for (ph = &phdr[l->l_phnum]; ph != phdr; --ph) - switch (ph[-1].p_type) - { - case PT_NOTE: - _dl_process_pt_note (l, &ph[-1]); - break; - case PT_GNU_PROPERTY: - _dl_process_pt_gnu_property (l, &ph[-1]); - break; - } } if (l->l_ld == 0) @@ -1377,6 +1364,21 @@ cannot enable executable stack as shared object requires"); if (l->l_tls_initimage != NULL) l->l_tls_initimage = (char *) l->l_tls_initimage + l->l_addr; + /* Process program headers again after load segments are mapped in + case processing requires accessing those segments. Scan program + headers backward so that PT_NOTE can be skipped if PT_GNU_PROPERTY + exits. */ + for (ph = &l->l_phdr[l->l_phnum]; ph != l->l_phdr; --ph) + switch (ph[-1].p_type) + { + case PT_NOTE: + _dl_process_pt_note (l, fd, &ph[-1]); + break; + case PT_GNU_PROPERTY: + _dl_process_pt_gnu_property (l, fd, &ph[-1]); + break; + } + /* We are done mapping in the file. We no longer need the descriptor. */ if (__glibc_unlikely (__close_nocancel (fd) != 0)) { diff --git a/elf/dl-open.c b/elf/dl-open.c index 8769e47051..55b39e1bbe 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -887,7 +887,7 @@ no more namespaces available for dlmopen()")); /* Avoid keeping around a dangling reference to the libc.so link map in case it has been cached in libc_map. */ if (!args.libc_already_loaded) - GL(dl_ns)[nsid].libc_map = NULL; + GL(dl_ns)[args.nsid].libc_map = NULL; /* Remove the object from memory. It may be in an inconsistent state if relocation failed, for example. */ diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c index 26e6e26612..15b29bcb90 100644 --- a/elf/dl-tunables.c +++ b/elf/dl-tunables.c @@ -177,6 +177,7 @@ parse_tunables (char *tunestr, char *valstring) return; char *p = tunestr; + size_t off = 0; while (true) { @@ -190,7 +191,11 @@ parse_tunables (char *tunestr, char *valstring) /* If we reach the end of the string before getting a valid name-value pair, bail out. */ if (p[len] == '\0') - return; + { + if (__libc_enable_secure) + tunestr[off] = '\0'; + return; + } /* We did not find a valid name-value pair before encountering the colon. */ @@ -216,35 +221,28 @@ parse_tunables (char *tunestr, char *valstring) if (tunable_is_name (cur->name, name)) { - /* If we are in a secure context (AT_SECURE) then ignore the tunable - unless it is explicitly marked as secure. Tunable values take - precedence over their envvar aliases. */ + /* If we are in a secure context (AT_SECURE) then ignore the + tunable unless it is explicitly marked as secure. Tunable + values take precedence over their envvar aliases. We write + the tunables that are not SXID_ERASE back to TUNESTR, thus + dropping all SXID_ERASE tunables and any invalid or + unrecognized tunables. */ if (__libc_enable_secure) { - if (cur->security_level == TUNABLE_SECLEVEL_SXID_ERASE) + if (cur->security_level != TUNABLE_SECLEVEL_SXID_ERASE) { - if (p[len] == '\0') - { - /* Last tunable in the valstring. Null-terminate and - return. */ - *name = '\0'; - return; - } - else - { - /* Remove the current tunable from the string. We do - this by overwriting the string starting from NAME - (which is where the current tunable begins) with - the remainder of the string. We then have P point - to NAME so that we continue in the correct - position in the valstring. */ - char *q = &p[len + 1]; - p = name; - while (*q != '\0') - *name++ = *q++; - name[0] = '\0'; - len = 0; - } + if (off > 0) + tunestr[off++] = ':'; + + const char *n = cur->name; + + while (*n != '\0') + tunestr[off++] = *n++; + + tunestr[off++] = '='; + + for (size_t j = 0; j < len; j++) + tunestr[off++] = value[j]; } if (cur->security_level != TUNABLE_SECLEVEL_NONE) @@ -257,9 +255,7 @@ parse_tunables (char *tunestr, char *valstring) } } - if (p[len] == '\0') - return; - else + if (p[len] != '\0') p += len + 1; } } diff --git a/elf/ifuncmain6pie.c b/elf/ifuncmain6pie.c index 04faeb86ef..4a01906836 100644 --- a/elf/ifuncmain6pie.c +++ b/elf/ifuncmain6pie.c @@ -9,7 +9,6 @@ #include "ifunc-sel.h" typedef int (*foo_p) (void); -extern foo_p foo_ptr; static int one (void) @@ -28,20 +27,17 @@ foo_ifunc (void) } extern int foo (void); -extern foo_p get_foo (void); +extern int call_foo (void); extern foo_p get_foo_p (void); -foo_p my_foo_ptr = foo; +foo_p foo_ptr = foo; int main (void) { foo_p p; - p = get_foo (); - if (p != foo) - abort (); - if ((*p) () != -30) + if (call_foo () != -30) abort (); p = get_foo_p (); @@ -52,12 +48,8 @@ main (void) if (foo_ptr != foo) abort (); - if (my_foo_ptr != foo) - abort (); if ((*foo_ptr) () != -30) abort (); - if ((*my_foo_ptr) () != -30) - abort (); if (foo () != -30) abort (); diff --git a/elf/ifuncmod6.c b/elf/ifuncmod6.c index 2e16c1d06d..2f6d0715e6 100644 --- a/elf/ifuncmod6.c +++ b/elf/ifuncmod6.c @@ -4,7 +4,7 @@ extern int foo (void); typedef int (*foo_p) (void); -foo_p foo_ptr = foo; +extern foo_p foo_ptr; foo_p get_foo_p (void) @@ -12,8 +12,8 @@ get_foo_p (void) return foo_ptr; } -foo_p -get_foo (void) +int +call_foo (void) { - return foo; + return foo (); } diff --git a/elf/rtld.c b/elf/rtld.c index 5b882163fa..14a42ed00a 100644 --- a/elf/rtld.c +++ b/elf/rtld.c @@ -1534,10 +1534,10 @@ of this helper program; chances are you did not intend to run this program.\n\ switch (ph[-1].p_type) { case PT_NOTE: - _dl_process_pt_note (main_map, &ph[-1]); + _dl_process_pt_note (main_map, -1, &ph[-1]); break; case PT_GNU_PROPERTY: - _dl_process_pt_gnu_property (main_map, &ph[-1]); + _dl_process_pt_gnu_property (main_map, -1, &ph[-1]); break; } diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c index 971d5892b1..ca0c8c245c 100644 --- a/elf/tst-env-setuid-tunables.c +++ b/elf/tst-env-setuid-tunables.c @@ -25,35 +25,76 @@ #include "config.h" #undef _LIBC -#define test_parent test_parent_tunables -#define test_child test_child_tunables - -static int test_child_tunables (void); -static int test_parent_tunables (void); - -#include "tst-env-setuid.c" - -#define CHILD_VALSTRING_VALUE "glibc.malloc.mmap_threshold=4096" -#define PARENT_VALSTRING_VALUE \ - "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +const char *teststrings[] = +{ + "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.check=2:glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096:glibc.malloc.check=2", + "glibc.malloc.perturb=0x800", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800:not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.not_valid.check=2:glibc.malloc.mmap_threshold=4096", + "not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096:glibc.malloc.check=2", + "glibc.malloc.check=4:glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096", + ":glibc.malloc.garbage=2:glibc.malloc.check=1", + "glibc.malloc.check=1:glibc.malloc.check=2", + "not_valid.malloc.check=2", + "glibc.not_valid.check=2", +}; + +const char *resultstrings[] = +{ + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "", + "", + "", + "", + "", + "", +}; static int -test_child_tunables (void) +test_child (int off) { const char *val = getenv ("GLIBC_TUNABLES"); #if HAVE_TUNABLES - if (val != NULL && strcmp (val, CHILD_VALSTRING_VALUE) == 0) + if (val != NULL && strcmp (val, resultstrings[off]) == 0) return 0; if (val != NULL) - printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val); + printf ("[%d] Unexpected GLIBC_TUNABLES VALUE %s\n", off, val); return 1; #else if (val != NULL) { - printf ("GLIBC_TUNABLES not cleared\n"); + printf ("[%d] GLIBC_TUNABLES not cleared\n", off); return 1; } return 0; @@ -61,15 +102,48 @@ test_child_tunables (void) } static int -test_parent_tunables (void) +do_test (int argc, char **argv) { - const char *val = getenv ("GLIBC_TUNABLES"); + /* Setgid child process. */ + if (argc == 2) + { + if (getgid () == getegid ()) + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); - if (val != NULL && strcmp (val, PARENT_VALSTRING_VALUE) == 0) - return 0; + int ret = test_child (atoi (argv[1])); - if (val != NULL) - printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val); + if (ret != 0) + exit (1); - return 1; + exit (EXIT_SUCCESS); + } + else + { + int ret = 0; + + /* Spawn tests. */ + for (int i = 0; i < array_length (teststrings); i++) + { + char buf[INT_BUFSIZE_BOUND (int)]; + + printf ("Spawned test for %s (%d)\n", teststrings[i], i); + snprintf (buf, sizeof (buf), "%d\n", i); + if (setenv ("GLIBC_TUNABLES", teststrings[i], 1) != 0) + exit (1); + + int status = support_capture_subprogram_self_sgid (buf); + + /* Bail out early if unsupported. */ + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + ret |= status; + } + return ret; + } } + +#define TEST_FUNCTION_ARGV do_test +#include diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c index 41dc79e83a..2dbccdb69e 100644 --- a/elf/tst-env-setuid.c +++ b/elf/tst-env-setuid.c @@ -29,173 +29,12 @@ #include #include +#include #include #include +#include static char SETGID_CHILD[] = "setgid-child"; -#define CHILD_STATUS 42 - -/* Return a GID which is not our current GID, but is present in the - supplementary group list. */ -static gid_t -choose_gid (void) -{ - const int count = 64; - gid_t groups[count]; - int ret = getgroups (count, groups); - if (ret < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t current = getgid (); - for (int i = 0; i < ret; ++i) - { - if (groups[i] != current) - return groups[i]; - } - return 0; -} - -/* Spawn and execute a program and verify that it returns the CHILD_STATUS. */ -static pid_t -do_execve (char **args) -{ - pid_t kid = vfork (); - - if (kid < 0) - { - printf ("vfork: %m\n"); - return -1; - } - - if (kid == 0) - { - /* Child process. */ - execve (args[0], args, environ); - _exit (-errno); - } - - if (kid < 0) - return 1; - - int status; - - if (waitpid (kid, &status, 0) < 0) - { - printf ("waitpid: %m\n"); - return 1; - } - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (!WIFEXITED (status) || WEXITSTATUS (status) != CHILD_STATUS) - { - printf ("Unexpected exit status %d from child process\n", - WEXITSTATUS (status)); - return 1; - } - return 0; -} - -/* Copies the executable into a restricted directory, so that we can - safely make it SGID with the TARGET group ID. Then runs the - executable. */ -static int -run_executable_sgid (gid_t target) -{ - char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", - test_dir, (intmax_t) getpid ()); - char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = 0; - if (mkdir (dirname, 0700) < 0) - { - printf ("mkdir: %m\n"); - goto err; - } - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) - { - printf ("open (/proc/self/exe): %m\n"); - goto err; - } - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - if (outfd < 0) - { - printf ("open (%s): %m\n", execname); - goto err; - } - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - if (rdcount < 0) - { - printf ("read: %m\n"); - goto err; - } - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - if (wrcount <= 0) - { - printf ("write: %m\n"); - goto err; - } - p += wrcount; - } - } - if (fchown (outfd, getuid (), target) < 0) - { - printf ("fchown (%s): %m\n", execname); - goto err; - } - if (fchmod (outfd, 02750) < 0) - { - printf ("fchmod (%s): %m\n", execname); - goto err; - } - if (close (outfd) < 0) - { - printf ("close (outfd): %m\n"); - goto err; - } - if (close (infd) < 0) - { - printf ("close (infd): %m\n"); - goto err; - } - - char *args[] = {execname, SETGID_CHILD, NULL}; - - ret = do_execve (args); - -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname) - { - unlink (execname); - free (execname); - } - if (dirname) - { - rmdir (dirname); - free (dirname); - } - return ret; -} #ifndef test_child static int @@ -256,40 +95,32 @@ do_test (int argc, char **argv) if (argc == 2 && strcmp (argv[1], SETGID_CHILD) == 0) { if (getgid () == getegid ()) - { - /* This can happen if the file system is mounted nosuid. */ - fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n", - (intmax_t) getgid ()); - exit (EXIT_UNSUPPORTED); - } + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); int ret = test_child (); if (ret != 0) exit (1); - exit (CHILD_STATUS); + exit (EXIT_SUCCESS); } else { if (test_parent () != 0) exit (1); - /* Try running a setgid program. */ - gid_t target = choose_gid (); - if (target == 0) - { - fprintf (stderr, - "Could not find a suitable GID for user %jd, skipping test\n", - (intmax_t) getuid ()); - exit (0); - } + int status = support_capture_subprogram_self_sgid (SETGID_CHILD); - return run_executable_sgid (target); - } + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + if (!WIFEXITED (status)) + FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); - /* Something went wrong and our argv was corrupted. */ - _exit (1); + return 0; + } } #define TEST_FUNCTION_ARGV do_test diff --git a/iconv/Versions b/iconv/Versions index 8a5f4cf780..d51af52fa3 100644 --- a/iconv/Versions +++ b/iconv/Versions @@ -6,7 +6,9 @@ libc { GLIBC_PRIVATE { # functions shared with iconv program __gconv_get_alias_db; __gconv_get_cache; __gconv_get_modules_db; - __gconv_open; __gconv_create_spec; + + # functions used elsewhere in glibc + __gconv_open; __gconv_create_spec; __gconv_destroy_spec; # function used by the gconv modules __gconv_transliterate; diff --git a/iconv/gconv_charset.c b/iconv/gconv_charset.c index 6ccd0773cc..4ba0aa99f5 100644 --- a/iconv/gconv_charset.c +++ b/iconv/gconv_charset.c @@ -216,3 +216,13 @@ out: return ret; } libc_hidden_def (__gconv_create_spec) + + +void +__gconv_destroy_spec (struct gconv_spec *conv_spec) +{ + free (conv_spec->fromcode); + free (conv_spec->tocode); + return; +} +libc_hidden_def (__gconv_destroy_spec) diff --git a/iconv/gconv_charset.h b/iconv/gconv_charset.h index b39b09aea1..e9c122cf7e 100644 --- a/iconv/gconv_charset.h +++ b/iconv/gconv_charset.h @@ -48,33 +48,6 @@ #define GCONV_IGNORE_ERRORS_SUFFIX "IGNORE" -/* This function accepts the charset names of the source and destination of the - conversion and populates *conv_spec with an equivalent conversion - specification that may later be used by __gconv_open. The charset names - might contain options in the form of suffixes that alter the conversion, - e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring - and truncating any suffix options in fromcode, and processing and truncating - any suffix options in tocode. Supported suffix options ("TRANSLIT" or - "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec - to be set to true. Unrecognized suffix options are silently discarded. If - the function succeeds, it returns conv_spec back to the caller. It returns - NULL upon failure. */ -struct gconv_spec * -__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode, - const char *tocode); -libc_hidden_proto (__gconv_create_spec) - - -/* This function frees all heap memory allocated by __gconv_create_spec. */ -static void __attribute__ ((unused)) -gconv_destroy_spec (struct gconv_spec *conv_spec) -{ - free (conv_spec->fromcode); - free (conv_spec->tocode); - return; -} - - /* This function copies in-order, characters from the source 's' that are either alpha-numeric or one in one of these: "_-.,:/" - into the destination 'wp' while dropping all other characters. In the process, it converts all diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h index e86938dae7..f721ce30ff 100644 --- a/iconv/gconv_int.h +++ b/iconv/gconv_int.h @@ -152,6 +152,27 @@ extern int __gconv_open (struct gconv_spec *conv_spec, __gconv_t *handle, int flags); libc_hidden_proto (__gconv_open) +/* This function accepts the charset names of the source and destination of the + conversion and populates *conv_spec with an equivalent conversion + specification that may later be used by __gconv_open. The charset names + might contain options in the form of suffixes that alter the conversion, + e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring + and truncating any suffix options in fromcode, and processing and truncating + any suffix options in tocode. Supported suffix options ("TRANSLIT" or + "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec + to be set to true. Unrecognized suffix options are silently discarded. If + the function succeeds, it returns conv_spec back to the caller. It returns + NULL upon failure. */ +extern struct gconv_spec * +__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode, + const char *tocode); +libc_hidden_proto (__gconv_create_spec) + +/* This function frees all heap memory allocated by __gconv_create_spec. */ +extern void +__gconv_destroy_spec (struct gconv_spec *conv_spec); +libc_hidden_proto (__gconv_destroy_spec) + /* Free resources associated with transformation descriptor CD. */ extern int __gconv_close (__gconv_t cd) attribute_hidden; diff --git a/iconv/iconv_open.c b/iconv/iconv_open.c index dd54bc12e0..5b30055c04 100644 --- a/iconv/iconv_open.c +++ b/iconv/iconv_open.c @@ -39,7 +39,7 @@ iconv_open (const char *tocode, const char *fromcode) int res = __gconv_open (&conv_spec, &cd, 0); - gconv_destroy_spec (&conv_spec); + __gconv_destroy_spec (&conv_spec); if (__builtin_expect (res, __GCONV_OK) != __GCONV_OK) { diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index b4334faa57..d59979759c 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -184,7 +184,7 @@ main (int argc, char *argv[]) /* Let's see whether we have these coded character sets. */ res = __gconv_open (&conv_spec, &cd, 0); - gconv_destroy_spec (&conv_spec); + __gconv_destroy_spec (&conv_spec); if (res != __GCONV_OK) { diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh index 8298136b7f..d8db7b335c 100644 --- a/iconv/tst-iconv_prog.sh +++ b/iconv/tst-iconv_prog.sh @@ -102,12 +102,16 @@ hangarray=( "\x00\x80;-c;IBM1161;UTF-8//TRANSLIT//IGNORE" "\x00\xdb;-c;IBM1162;UTF-8//TRANSLIT//IGNORE" "\x00\x70;-c;IBM12712;UTF-8//TRANSLIT//IGNORE" -# These are known hangs that are yet to be fixed: -# "\x00\x0f;-c;IBM1364;UTF-8" -# "\x00\x0f;-c;IBM1371;UTF-8" -# "\x00\x0f;-c;IBM1388;UTF-8" -# "\x00\x0f;-c;IBM1390;UTF-8" -# "\x00\x0f;-c;IBM1399;UTF-8" +"\x00\x0f;-c;IBM1364;UTF-8" +"\x0e\x0e;-c;IBM1364;UTF-8" +"\x00\x0f;-c;IBM1371;UTF-8" +"\x0e\x0e;-c;IBM1371;UTF-8" +"\x00\x0f;-c;IBM1388;UTF-8" +"\x0e\x0e;-c;IBM1388;UTF-8" +"\x00\x0f;-c;IBM1390;UTF-8" +"\x0e\x0e;-c;IBM1390;UTF-8" +"\x00\x0f;-c;IBM1399;UTF-8" +"\x0e\x0e;-c;IBM1399;UTF-8" "\x00\x53;-c;IBM16804;UTF-8//TRANSLIT//IGNORE" "\x00\x41;-c;IBM274;UTF-8//TRANSLIT//IGNORE" "\x00\x41;-c;IBM275;UTF-8//TRANSLIT//IGNORE" diff --git a/iconvdata/Makefile b/iconvdata/Makefile index 4ec2741cdc..b67b4feeb4 100644 --- a/iconvdata/Makefile +++ b/iconvdata/Makefile @@ -1,4 +1,5 @@ # Copyright (C) 1997-2020 Free Software Foundation, Inc. +# Copyright (C) The GNU Toolchain Authors. # This file is part of the GNU C Library. # The GNU C Library is free software; you can redistribute it and/or @@ -73,7 +74,8 @@ modules.so := $(addsuffix .so, $(modules)) ifeq (yes,$(build-shared)) tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \ tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \ - bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 + bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 \ + bug-iconv13 bug-iconv14 bug-iconv15 ifeq ($(have-thread-library),yes) tests += bug-iconv3 endif @@ -321,6 +323,10 @@ $(objpfx)bug-iconv10.out: $(objpfx)gconv-modules \ $(addprefix $(objpfx),$(modules.so)) $(objpfx)bug-iconv12.out: $(objpfx)gconv-modules \ $(addprefix $(objpfx),$(modules.so)) +$(objpfx)bug-iconv14.out: $(objpfx)gconv-modules \ + $(addprefix $(objpfx),$(modules.so)) +$(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \ + $(addprefix $(objpfx),$(modules.so)) $(objpfx)iconv-test.out: run-iconv-test.sh $(objpfx)gconv-modules \ $(addprefix $(objpfx),$(modules.so)) \ diff --git a/iconvdata/bug-iconv13.c b/iconvdata/bug-iconv13.c new file mode 100644 index 0000000000..87aaff398e --- /dev/null +++ b/iconvdata/bug-iconv13.c @@ -0,0 +1,53 @@ +/* bug 24973: Test EUC-KR module + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +static int +do_test (void) +{ + iconv_t cd = iconv_open ("UTF-8//IGNORE", "EUC-KR"); + TEST_VERIFY_EXIT (cd != (iconv_t) -1); + + /* 0xfe (->0x7e : row 94) and 0xc9 (->0x49 : row 41) are user-defined + areas, which are not allowed and should be skipped over due to + //IGNORE. The trailing 0xfe also is an incomplete sequence, which + should be checked first. */ + char input[4] = { '\xc9', '\xa1', '\0', '\xfe' }; + char *inptr = input; + size_t insize = sizeof (input); + char output[4]; + char *outptr = output; + size_t outsize = sizeof (output); + + /* This used to crash due to buffer overrun. */ + TEST_VERIFY (iconv (cd, &inptr, &insize, &outptr, &outsize) == (size_t) -1); + TEST_VERIFY (errno == EINVAL); + /* The conversion should produce one character, the converted null + character. */ + TEST_VERIFY (sizeof (output) - outsize == 1); + + TEST_VERIFY_EXIT (iconv_close (cd) != -1); + + return 0; +} + +#include diff --git a/iconvdata/bug-iconv14.c b/iconvdata/bug-iconv14.c new file mode 100644 index 0000000000..902f140fa9 --- /dev/null +++ b/iconvdata/bug-iconv14.c @@ -0,0 +1,127 @@ +/* Assertion in ISO-2022-JP-3 due to two-character sequence (bug 27256). + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +/* Use an escape sequence to return to the initial state. */ +static void +with_escape_sequence (void) +{ + iconv_t c = iconv_open ("UTF-8", "ISO-2022-JP-3"); + TEST_VERIFY_EXIT (c != (iconv_t) -1); + + char in[] = "\e$(O+D\e(B"; + char *inbuf = in; + size_t inleft = strlen (in); + char out[3]; /* Space for one output character. */ + char *outbuf; + size_t outleft; + + outbuf = out; + outleft = sizeof (out); + TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), (size_t) -1); + TEST_COMPARE (errno, E2BIG); + TEST_COMPARE (inleft, 3); + TEST_COMPARE (inbuf - in, strlen (in) - 3); + TEST_COMPARE (outleft, sizeof (out) - 2); + TEST_COMPARE (outbuf - out, 2); + TEST_COMPARE (out[0] & 0xff, 0xc3); + TEST_COMPARE (out[1] & 0xff, 0xa6); + + /* Return to the initial shift state, producing the pending + character. */ + outbuf = out; + outleft = sizeof (out); + TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), 0); + TEST_COMPARE (inleft, 0); + TEST_COMPARE (inbuf - in, strlen (in)); + TEST_COMPARE (outleft, sizeof (out) - 2); + TEST_COMPARE (outbuf - out, 2); + TEST_COMPARE (out[0] & 0xff, 0xcc); + TEST_COMPARE (out[1] & 0xff, 0x80); + + /* Nothing should be flushed the second time. */ + outbuf = out; + outleft = sizeof (out); + TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0); + TEST_COMPARE (outleft, sizeof (out)); + TEST_COMPARE (outbuf - out, 0); + TEST_COMPARE (out[0] & 0xff, 0xcc); + TEST_COMPARE (out[1] & 0xff, 0x80); + + TEST_COMPARE (iconv_close (c), 0); +} + +/* Use an explicit flush to return to the initial state. */ +static void +with_flush (void) +{ + iconv_t c = iconv_open ("UTF-8", "ISO-2022-JP-3"); + TEST_VERIFY_EXIT (c != (iconv_t) -1); + + char in[] = "\e$(O+D"; + char *inbuf = in; + size_t inleft = strlen (in); + char out[3]; /* Space for one output character. */ + char *outbuf; + size_t outleft; + + outbuf = out; + outleft = sizeof (out); + TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), (size_t) -1); + TEST_COMPARE (errno, E2BIG); + TEST_COMPARE (inleft, 0); + TEST_COMPARE (inbuf - in, strlen (in)); + TEST_COMPARE (outleft, sizeof (out) - 2); + TEST_COMPARE (outbuf - out, 2); + TEST_COMPARE (out[0] & 0xff, 0xc3); + TEST_COMPARE (out[1] & 0xff, 0xa6); + + /* Flush the pending character. */ + outbuf = out; + outleft = sizeof (out); + TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0); + TEST_COMPARE (outleft, sizeof (out) - 2); + TEST_COMPARE (outbuf - out, 2); + TEST_COMPARE (out[0] & 0xff, 0xcc); + TEST_COMPARE (out[1] & 0xff, 0x80); + + /* Nothing should be flushed the second time. */ + outbuf = out; + outleft = sizeof (out); + TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0); + TEST_COMPARE (outleft, sizeof (out)); + TEST_COMPARE (outbuf - out, 0); + TEST_COMPARE (out[0] & 0xff, 0xcc); + TEST_COMPARE (out[1] & 0xff, 0x80); + + TEST_COMPARE (iconv_close (c), 0); +} + +static int +do_test (void) +{ + with_escape_sequence (); + with_flush (); + return 0; +} + +#include diff --git a/iconvdata/bug-iconv15.c b/iconvdata/bug-iconv15.c new file mode 100644 index 0000000000..cc04bd0313 --- /dev/null +++ b/iconvdata/bug-iconv15.c @@ -0,0 +1,60 @@ +/* Bug 28524: Conversion from ISO-2022-JP-3 with iconv + may emit spurious NUL character on state reset. + Copyright (C) The GNU Toolchain Authors. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + +static int +do_test (void) +{ + char in[] = "\x1b(I"; + char *inbuf = in; + size_t inleft = sizeof (in) - 1; + char out[1]; + char *outbuf = out; + size_t outleft = sizeof (out); + iconv_t cd; + + cd = iconv_open ("UTF8", "ISO-2022-JP-3"); + TEST_VERIFY_EXIT (cd != (iconv_t) -1); + + /* First call to iconv should alter internal state. + Now, JISX0201_Kana_set is selected and + state value != ASCII_set. */ + TEST_VERIFY (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1); + + /* No bytes should have been added to + the output buffer at this point. */ + TEST_VERIFY (outbuf == out); + TEST_VERIFY (outleft == sizeof (out)); + + /* Second call shall emit spurious NUL character in unpatched glibc. */ + TEST_VERIFY (iconv (cd, NULL, NULL, &outbuf, &outleft) != (size_t) -1); + + /* No characters are expected to be produced. */ + TEST_VERIFY (outbuf == out); + TEST_VERIFY (outleft == sizeof (out)); + + TEST_VERIFY_EXIT (iconv_close (cd) != -1); + + return 0; +} + +#include diff --git a/iconvdata/euc-kr.c b/iconvdata/euc-kr.c index b0d56cf3ee..1045bae926 100644 --- a/iconvdata/euc-kr.c +++ b/iconvdata/euc-kr.c @@ -80,11 +80,7 @@ euckr_from_ucs4 (uint32_t ch, unsigned char *cp) \ if (ch <= 0x9f) \ ++inptr; \ - /* 0xfe(->0x7e : row 94) and 0xc9(->0x59 : row 41) are \ - user-defined areas. */ \ - else if (__builtin_expect (ch == 0xa0, 0) \ - || __builtin_expect (ch > 0xfe, 0) \ - || __builtin_expect (ch == 0xc9, 0)) \ + else if (__glibc_unlikely (ch == 0xa0)) \ { \ /* This is illegal. */ \ STANDARD_FROM_LOOP_ERR_HANDLER (1); \ diff --git a/iconvdata/ibm1364.c b/iconvdata/ibm1364.c index 49e7267ab4..521f0825b7 100644 --- a/iconvdata/ibm1364.c +++ b/iconvdata/ibm1364.c @@ -158,24 +158,14 @@ enum \ if (__builtin_expect (ch, 0) == SO) \ { \ - /* Shift OUT, change to DBCS converter. */ \ - if (curcs == db) \ - { \ - result = __GCONV_ILLEGAL_INPUT; \ - break; \ - } \ + /* Shift OUT, change to DBCS converter (redundant escape okay). */ \ curcs = db; \ ++inptr; \ continue; \ } \ if (__builtin_expect (ch, 0) == SI) \ { \ - /* Shift IN, change to SBCS converter. */ \ - if (curcs == sb) \ - { \ - result = __GCONV_ILLEGAL_INPUT; \ - break; \ - } \ + /* Shift IN, change to SBCS converter (redundant escape okay). */ \ curcs = sb; \ ++inptr; \ continue; \ diff --git a/iconvdata/iso-2022-jp-3.c b/iconvdata/iso-2022-jp-3.c index 8c3b7e627e..c7b470db61 100644 --- a/iconvdata/iso-2022-jp-3.c +++ b/iconvdata/iso-2022-jp-3.c @@ -1,5 +1,6 @@ /* Conversion module for ISO-2022-JP-3. Copyright (C) 1998-2020 Free Software Foundation, Inc. + Copyright (C) The GNU Toolchain Authors. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1998, and Bruno Haible , 2002. @@ -67,10 +68,15 @@ enum CURRENT_SEL_MASK = 7 << 3 }; -/* During UCS-4 to ISO-2022-JP-3 conversion, the COUNT element of the state - also contains the last two bytes to be output, shifted by 6 bits, and a - one-bit indicator whether they must be preceded by the shift sequence, - in bit 22. */ +/* During UCS-4 to ISO-2022-JP-3 conversion, the COUNT element of the + state also contains the last two bytes to be output, shifted by 6 + bits, and a one-bit indicator whether they must be preceded by the + shift sequence, in bit 22. During ISO-2022-JP-3 to UCS-4 + conversion, COUNT may also contain a non-zero pending wide + character, shifted by six bits. This happens for certain inputs in + JISX0213_1_2004_set and JISX0213_2_set if the second wide character + in a combining sequence cannot be written because the buffer is + full. */ /* Since this is a stateful encoding we have to provide code which resets the output state to the initial state. This has to be done during the @@ -80,10 +86,27 @@ enum { \ if (FROM_DIRECTION) \ { \ - /* It's easy, we don't have to emit anything, we just reset the \ - state for the input. */ \ - data->__statep->__count &= 7; \ - data->__statep->__count |= ASCII_set; \ + uint32_t ch = data->__statep->__count >> 6; \ + \ + if (__glibc_unlikely (ch != 0)) \ + { \ + if (__glibc_likely (outbuf + 4 <= outend)) \ + { \ + /* Write out the last character. */ \ + put32u (outbuf, ch); \ + outbuf += 4; \ + data->__statep->__count &= 7; \ + data->__statep->__count |= ASCII_set; \ + } \ + else \ + /* We don't have enough room in the output buffer. */ \ + status = __GCONV_FULL_OUTPUT; \ + } \ + else \ + { \ + data->__statep->__count &= 7; \ + data->__statep->__count |= ASCII_set; \ + } \ } \ else \ { \ @@ -151,7 +174,21 @@ enum #define LOOPFCT FROM_LOOP #define BODY \ { \ - uint32_t ch = *inptr; \ + uint32_t ch; \ + \ + /* Output any pending character. */ \ + ch = set >> 6; \ + if (__glibc_unlikely (ch != 0)) \ + { \ + put32 (outptr, ch); \ + outptr += 4; \ + /* Remove the pending character, but preserve state bits. */ \ + set &= (1 << 6) - 1; \ + continue; \ + } \ + \ + /* Otherwise read the next input byte. */ \ + ch = *inptr; \ \ /* Recognize escape sequences. */ \ if (__glibc_unlikely (ch == ESC)) \ @@ -297,21 +334,25 @@ enum uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0]; \ uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1]; \ \ + inptr += 2; \ + \ + put32 (outptr, u1); \ + outptr += 4; \ + \ /* See whether we have room for two characters. */ \ - if (outptr + 8 <= outend) \ + if (outptr + 4 <= outend) \ { \ - inptr += 2; \ - put32 (outptr, u1); \ - outptr += 4; \ put32 (outptr, u2); \ outptr += 4; \ continue; \ } \ - else \ - { \ - result = __GCONV_FULL_OUTPUT; \ - break; \ - } \ + \ + /* Otherwise store only the first character now, and \ + put the second one into the queue. */ \ + set |= u2 << 6; \ + /* Tell the caller why we terminate the loop. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ } \ \ inptr += 2; \ diff --git a/iconvdata/ksc5601.h b/iconvdata/ksc5601.h index d3eb3a4ff8..f5cdc72797 100644 --- a/iconvdata/ksc5601.h +++ b/iconvdata/ksc5601.h @@ -50,15 +50,15 @@ ksc5601_to_ucs4 (const unsigned char **s, size_t avail, unsigned char offset) unsigned char ch2; int idx; + if (avail < 2) + return 0; + /* row 94(0x7e) and row 41(0x49) are user-defined area in KS C 5601 */ if (ch < offset || (ch - offset) <= 0x20 || (ch - offset) >= 0x7e || (ch - offset) == 0x49) return __UNKNOWN_10646_CHAR; - if (avail < 2) - return 0; - ch2 = (*s)[1]; if (ch2 < offset || (ch2 - offset) <= 0x20 || (ch2 - offset) >= 0x7f) return __UNKNOWN_10646_CHAR; diff --git a/intl/dcigettext.c b/intl/dcigettext.c index 2e7c662bc7..bd332e71da 100644 --- a/intl/dcigettext.c +++ b/intl/dcigettext.c @@ -1120,15 +1120,18 @@ _nl_find_msg (struct loaded_l10nfile *domain_file, # ifdef _LIBC - struct gconv_spec conv_spec - = { .fromcode = norm_add_slashes (charset, ""), - .tocode = norm_add_slashes (outcharset, ""), - /* We always want to use transliteration. */ - .translit = true, - .ignore = false - }; + struct gconv_spec conv_spec; + + __gconv_create_spec (&conv_spec, charset, outcharset); + + /* We always want to use transliteration. */ + conv_spec.translit = true; + int r = __gconv_open (&conv_spec, &convd->conv, GCONV_AVOID_NOCONV); + + __gconv_destroy_spec (&conv_spec); + if (__builtin_expect (r != __GCONV_OK, 0)) { /* If the output encoding is the same there is diff --git a/intl/tst-codeset.c b/intl/tst-codeset.c index fd70432eca..e9f6e5e09f 100644 --- a/intl/tst-codeset.c +++ b/intl/tst-codeset.c @@ -22,13 +22,11 @@ #include #include #include +#include static int do_test (void) { - char *s; - int result = 0; - unsetenv ("LANGUAGE"); unsetenv ("OUTPUT_CHARSET"); setlocale (LC_ALL, "de_DE.ISO-8859-1"); @@ -36,25 +34,21 @@ do_test (void) bindtextdomain ("codeset", OBJPFX "domaindir"); /* Here we expect output in ISO-8859-1. */ - s = gettext ("cheese"); - if (strcmp (s, "K\344se")) - { - printf ("call 1 returned: %s\n", s); - result = 1; - } + TEST_COMPARE_STRING (gettext ("cheese"), "K\344se"); + /* Here we expect output in UTF-8. */ bind_textdomain_codeset ("codeset", "UTF-8"); + TEST_COMPARE_STRING (gettext ("cheese"), "K\303\244se"); - /* Here we expect output in UTF-8. */ - s = gettext ("cheese"); - if (strcmp (s, "K\303\244se")) - { - printf ("call 2 returned: %s\n", s); - result = 1; - } - - return result; + /* `a with umlaut' is transliterated to `ae'. */ + bind_textdomain_codeset ("codeset", "ASCII//TRANSLIT"); + TEST_COMPARE_STRING (gettext ("cheese"), "Kaese"); + + /* Transliteration also works by default even if not set. */ + bind_textdomain_codeset ("codeset", "ASCII"); + TEST_COMPARE_STRING (gettext ("cheese"), "Kaese"); + + return 0; } -#define TEST_FUNCTION do_test () -#include "../test-skeleton.c" +#include diff --git a/malloc/Makefile b/malloc/Makefile index e22cbde22d..5093e8730e 100644 --- a/malloc/Makefile +++ b/malloc/Makefile @@ -62,6 +62,16 @@ endif tests += $(tests-static) test-srcs = tst-mtrace +# These tests either are run with MALLOC_CHECK_=3 by default or do not work +# with MALLOC_CHECK_=3 because they expect a specific failure. +tests-exclude-mcheck = tst-mcheck tst-malloc-usable \ + tst-interpose-nothread tst-interpose-static-nothread \ + tst-interpose-static-thread tst-malloc-too-large \ + tst-mxfast tst-safe-linking + +# Run all tests with MALLOC_CHECK_=3 +tests-mcheck = $(filter-out $(tests-exclude-mcheck),$(tests)) + routines = malloc morecore mcheck mtrace obstack reallocarray \ scratch_buffer_grow scratch_buffer_grow_preserve \ scratch_buffer_set_array_size \ @@ -100,6 +110,11 @@ $(objpfx)tst-malloc-thread-exit: $(shared-thread-library) $(objpfx)tst-malloc-thread-fail: $(shared-thread-library) $(objpfx)tst-malloc-fork-deadlock: $(shared-thread-library) $(objpfx)tst-malloc-stats-cancellation: $(shared-thread-library) +$(objpfx)tst-malloc-backtrace-mcheck: $(shared-thread-library) +$(objpfx)tst-malloc-thread-exit-mcheck: $(shared-thread-library) +$(objpfx)tst-malloc-thread-fail-mcheck: $(shared-thread-library) +$(objpfx)tst-malloc-fork-deadlock-mcheck: $(shared-thread-library) +$(objpfx)tst-malloc-stats-cancellation-mcheck: $(shared-thread-library) # Export the __malloc_initialize_hook variable to libc.so. LDFLAGS-tst-mallocstate = -rdynamic @@ -239,6 +254,8 @@ $(tests:%=$(objpfx)%.o): CPPFLAGS += -DTEST_NO_MALLOPT $(objpfx)tst-interpose-nothread: $(objpfx)tst-interpose-aux-nothread.o $(objpfx)tst-interpose-thread: \ $(objpfx)tst-interpose-aux-thread.o $(shared-thread-library) +$(objpfx)tst-interpose-thread-mcheck: \ + $(objpfx)tst-interpose-aux-thread.o $(shared-thread-library) $(objpfx)tst-interpose-static-nothread: $(objpfx)tst-interpose-aux-nothread.o $(objpfx)tst-interpose-static-thread: \ $(objpfx)tst-interpose-aux-thread.o $(static-thread-library) @@ -256,3 +273,6 @@ $(objpfx)tst-dynarray-fail-mem.out: $(objpfx)tst-dynarray-fail.out $(objpfx)tst-malloc-tcache-leak: $(shared-thread-library) $(objpfx)tst-malloc_info: $(shared-thread-library) $(objpfx)tst-mallocfork2: $(shared-thread-library) +$(objpfx)tst-malloc-tcache-leak-mcheck: $(shared-thread-library) +$(objpfx)tst-malloc_info-mcheck: $(shared-thread-library) +$(objpfx)tst-mallocfork2-mcheck: $(shared-thread-library) diff --git a/manual/tunables.texi b/manual/tunables.texi index 23ef0d40e7..d72d7a5ec0 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -432,7 +432,11 @@ set shared cache size in bytes for use in memory and string routines. @deftp Tunable glibc.cpu.x86_non_temporal_threshold The @code{glibc.cpu.x86_non_temporal_threshold} tunable allows the user -to set threshold in bytes for non temporal store. +to set threshold in bytes for non temporal store. Non temporal stores +give a hint to the hardware to move data directly to memory without +displacing other data from the cache. This tunable is used by some +platforms to determine when to use non temporal stores in operations +like memmove and memcpy. This tunable is specific to i386 and x86-64. @end deftp diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h index 19d9cc5cfe..38221d0b2a 100644 --- a/misc/sys/cdefs.h +++ b/misc/sys/cdefs.h @@ -124,13 +124,10 @@ #define __bos0(ptr) __builtin_object_size (ptr, 0) #if __GNUC_PREREQ (4,3) -# define __warndecl(name, msg) \ - extern void name (void) __attribute__((__warning__ (msg))) # define __warnattr(msg) __attribute__((__warning__ (msg))) # define __errordecl(name, msg) \ extern void name (void) __attribute__((__error__ (msg))) #else -# define __warndecl(name, msg) extern void name (void) # define __warnattr(msg) # define __errordecl(name, msg) extern void name (void) #endif diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c index 2cba3da38c..c217cda608 100644 --- a/nptl/pthread_create.c +++ b/nptl/pthread_create.c @@ -416,8 +416,6 @@ START_THREAD_DEFN unwind_buf.priv.data.prev = NULL; unwind_buf.priv.data.cleanup = NULL; - __libc_signal_restore_set (&pd->sigmask); - /* Allow setxid from now onwards. */ if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0) == -2)) futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE); @@ -427,6 +425,8 @@ START_THREAD_DEFN /* Store the new cleanup handler info. */ THREAD_SETMEM (pd, cleanup_jmp_buf, &unwind_buf); + __libc_signal_restore_set (&pd->sigmask); + /* We are either in (a) or (b), and in either case we either own PD already (2) or are about to own PD (1), and so our only restriction would be that we can't free PD until we know we diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c index 88c69d1e9c..381aa721ef 100644 --- a/nscd/netgroupcache.c +++ b/nscd/netgroupcache.c @@ -248,7 +248,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, : NULL); ndomain = (ndomain ? newbuf + ndomaindiff : NULL); - buffer = newbuf; + *tofreep = buffer = newbuf; } nhost = memcpy (buffer + bufused, @@ -319,7 +319,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req, else if (status == NSS_STATUS_TRYAGAIN && e == ERANGE) { buflen *= 2; - buffer = xrealloc (buffer, buflen); + *tofreep = buffer = xrealloc (buffer, buflen); } else if (status == NSS_STATUS_RETURN || status == NSS_STATUS_NOTFOUND diff --git a/nss/tst-nss-files-hosts-long.root/etc/nsswitch.conf b/nss/tst-nss-files-hosts-long.root/etc/nsswitch.conf new file mode 100644 index 0000000000..5b0c6a4199 --- /dev/null +++ b/nss/tst-nss-files-hosts-long.root/etc/nsswitch.conf @@ -0,0 +1 @@ +hosts: files diff --git a/posix/bits/unistd.h b/posix/bits/unistd.h index 725a83eb0d..7e5bb6fb1e 100644 --- a/posix/bits/unistd.h +++ b/posix/bits/unistd.h @@ -193,10 +193,9 @@ __NTH (readlinkat (int __fd, const char *__restrict __path, #endif extern char *__getcwd_chk (char *__buf, size_t __size, size_t __buflen) - __THROW __wur __attr_access ((__write_only__, 1, 2)); + __THROW __wur; extern char *__REDIRECT_NTH (__getcwd_alias, - (char *__buf, size_t __size), getcwd) - __wur __attr_access ((__write_only__, 1, 2)); + (char *__buf, size_t __size), getcwd) __wur; extern char *__REDIRECT_NTH (__getcwd_chk_warn, (char *__buf, size_t __size, size_t __buflen), __getcwd_chk) diff --git a/posix/unistd.h b/posix/unistd.h index 32b8161619..acf9ee7e79 100644 --- a/posix/unistd.h +++ b/posix/unistd.h @@ -517,8 +517,7 @@ extern int fchdir (int __fd) __THROW __wur; an array is allocated with `malloc'; the array is SIZE bytes long, unless SIZE == 0, in which case it is as big as necessary. */ -extern char *getcwd (char *__buf, size_t __size) __THROW __wur - __attr_access ((__write_only__, 1, 2)); +extern char *getcwd (char *__buf, size_t __size) __THROW __wur; #ifdef __USE_GNU /* Return a malloc'd string containing the current directory name. @@ -831,7 +830,7 @@ extern int symlinkat (const char *__from, int __tofd, /* Like readlink but a relative PATH is interpreted relative to FD. */ extern ssize_t readlinkat (int __fd, const char *__restrict __path, char *__restrict __buf, size_t __len) - __THROW __nonnull ((2, 3)) __wur __attr_access ((__read_only__, 3, 4)); + __THROW __nonnull ((2, 3)) __wur __attr_access ((__write_only__, 3, 4)); #endif /* Remove the link NAME. */ diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c index ed1b22308e..cb3f989cba 100644 --- a/posix/wordexp-test.c +++ b/posix/wordexp-test.c @@ -183,6 +183,7 @@ struct test_case_struct { 0, NULL, "$var", 0, 0, { NULL, }, IFS }, { 0, NULL, "\"\\n\"", 0, 1, { "\\n", }, IFS }, { 0, NULL, "", 0, 0, { NULL, }, IFS }, + { 0, NULL, "${1234567890123456789012}", 0, 0, { NULL, }, IFS }, /* Flags not already covered (testit() has special handling for these) */ { 0, NULL, "one two", WRDE_DOOFFS, 2, { "one", "two", }, IFS }, diff --git a/posix/wordexp.c b/posix/wordexp.c index e082d94895..56289503a1 100644 --- a/posix/wordexp.c +++ b/posix/wordexp.c @@ -1399,7 +1399,7 @@ envsubst: /* Is it a numeric parameter? */ else if (isdigit (env[0])) { - int n = atoi (env); + unsigned long n = strtoul (env, NULL, 10); if (n >= __libc_argc) /* Substitute NULL. */ diff --git a/resolv/Makefile b/resolv/Makefile index b61c0c3e0c..dbd8f8bf4f 100644 --- a/resolv/Makefile +++ b/resolv/Makefile @@ -61,6 +61,11 @@ tests += \ tst-resolv-search \ tst-resolv-trailing \ +# This test calls __res_context_send directly, which is not exported +# from libresolv. +tests-internal += tst-resolv-txnid-collision +tests-static += tst-resolv-txnid-collision + # These tests need libdl. ifeq (yes,$(build-shared)) tests += \ @@ -191,6 +196,8 @@ $(objpfx)tst-resolv-search: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-trailing: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-threads: \ $(libdl) $(objpfx)libresolv.so $(shared-thread-library) +$(objpfx)tst-resolv-txnid-collision: $(objpfx)libresolv.a \ + $(static-thread-library) $(objpfx)tst-resolv-canonname: \ $(libdl) $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-trustad: $(objpfx)libresolv.so $(shared-thread-library) diff --git a/resolv/res_send.c b/resolv/res_send.c index 7e5fec6646..70e5066031 100644 --- a/resolv/res_send.c +++ b/resolv/res_send.c @@ -1342,15 +1342,6 @@ send_dg(res_state statp, *terrno = EMSGSIZE; return close_and_return_error (statp, resplen2); } - if ((recvresp1 || hp->id != anhp->id) - && (recvresp2 || hp2->id != anhp->id)) { - /* - * response from old query, ignore it. - * XXX - potential security hazard could - * be detected here. - */ - goto wait; - } /* Paranoia check. Due to the connected UDP socket, the kernel has already filtered invalid addresses @@ -1360,15 +1351,24 @@ send_dg(res_state statp, /* Check for the correct header layout and a matching question. */ - if ((recvresp1 || !res_queriesmatch(buf, buf + buflen, - *thisansp, - *thisansp - + *thisanssizp)) - && (recvresp2 || !res_queriesmatch(buf2, buf2 + buflen2, - *thisansp, - *thisansp - + *thisanssizp))) - goto wait; + int matching_query = 0; /* Default to no matching query. */ + if (!recvresp1 + && anhp->id == hp->id + && res_queriesmatch (buf, buf + buflen, + *thisansp, *thisansp + *thisanssizp)) + matching_query = 1; + if (!recvresp2 + && anhp->id == hp2->id + && res_queriesmatch (buf2, buf2 + buflen2, + *thisansp, *thisansp + *thisanssizp)) + matching_query = 2; + if (matching_query == 0) + /* Spurious UDP packet. Drop it and continue + waiting. */ + { + need_recompute = 1; + goto wait; + } if (anhp->rcode == SERVFAIL || anhp->rcode == NOTIMP || @@ -1383,7 +1383,7 @@ send_dg(res_state statp, /* No data from the first reply. */ resplen = 0; /* We are waiting for a possible second reply. */ - if (hp->id == anhp->id) + if (matching_query == 1) recvresp1 = 1; else recvresp2 = 1; @@ -1414,7 +1414,7 @@ send_dg(res_state statp, return (1); } /* Mark which reply we received. */ - if (recvresp1 == 0 && hp->id == anhp->id) + if (matching_query == 1) recvresp1 = 1; else recvresp2 = 1; diff --git a/resolv/tst-resolv-txnid-collision.c b/resolv/tst-resolv-txnid-collision.c new file mode 100644 index 0000000000..189b76f126 --- /dev/null +++ b/resolv/tst-resolv-txnid-collision.c @@ -0,0 +1,334 @@ +/* Test parallel queries with transaction ID collisions. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Result of parsing a DNS question name. + + A question name has the form reorder-N-M-rcode-C.example.net, where + N and M are either 0 and 1, corresponding to the reorder member, + and C is a number that will be stored in the rcode field. + + Also see parse_qname below. */ +struct parsed_qname +{ + /* The DNS response code requested from the first server. The + second server always responds with RCODE zero. */ + int rcode; + + /* Indicates whether to perform reordering in the responses from the + respective server. */ + bool reorder[2]; +}; + +/* Fills *PARSED based on QNAME. */ +static void +parse_qname (struct parsed_qname *parsed, const char *qname) +{ + int reorder0; + int reorder1; + int rcode; + char *suffix; + if (sscanf (qname, "reorder-%d-%d.rcode-%d.%ms", + &reorder0, &reorder1, &rcode, &suffix) == 4) + { + if (reorder0 != 0) + TEST_COMPARE (reorder0, 1); + if (reorder1 != 0) + TEST_COMPARE (reorder1, 1); + TEST_VERIFY (rcode >= 0 && rcode <= 15); + TEST_COMPARE_STRING (suffix, "example.net"); + free (suffix); + + parsed->rcode = rcode; + parsed->reorder[0] = reorder0; + parsed->reorder[1] = reorder1; + } + else + FAIL_EXIT1 ("unexpected query: %s", qname); +} + +/* Used to construct a response. The first server responds with an + error, the second server succeeds. */ +static void +build_response (const struct resolv_response_context *ctx, + struct resolv_response_builder *b, + const char *qname, uint16_t qclass, uint16_t qtype) +{ + struct parsed_qname parsed; + parse_qname (&parsed, qname); + + switch (ctx->server_index) + { + case 0: + { + struct resolv_response_flags flags = { 0 }; + if (parsed.rcode == 0) + /* Simulate a delegation in case a NODATA (RCODE zero) + response is requested. */ + flags.clear_ra = true; + else + flags.rcode = parsed.rcode; + + resolv_response_init (b, flags); + resolv_response_add_question (b, qname, qclass, qtype); + } + break; + + case 1: + { + struct resolv_response_flags flags = { 0, }; + resolv_response_init (b, flags); + resolv_response_add_question (b, qname, qclass, qtype); + + resolv_response_section (b, ns_s_an); + resolv_response_open_record (b, qname, qclass, qtype, 0); + if (qtype == T_A) + { + char ipv4[4] = { 192, 0, 2, 1 }; + resolv_response_add_data (b, &ipv4, sizeof (ipv4)); + } + else + { + char ipv6[16] + = { 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + resolv_response_add_data (b, &ipv6, sizeof (ipv6)); + } + resolv_response_close_record (b); + } + break; + } +} + +/* Used to reorder responses. */ +struct resolv_response_context *previous_query; + +/* Used to keep track of the queries received. */ +static int previous_server_index = -1; +static uint16_t previous_qtype; + +/* For each server, buffer the first query and then send both answers + to the second query, reordered if requested. */ +static void +response (const struct resolv_response_context *ctx, + struct resolv_response_builder *b, + const char *qname, uint16_t qclass, uint16_t qtype) +{ + TEST_VERIFY (qtype == T_A || qtype == T_AAAA); + if (ctx->server_index != 0) + TEST_COMPARE (ctx->server_index, 1); + + struct parsed_qname parsed; + parse_qname (&parsed, qname); + + if (previous_query == NULL) + { + /* No buffered query. Record this query and do not send a + response. */ + TEST_COMPARE (previous_qtype, 0); + previous_query = resolv_response_context_duplicate (ctx); + previous_qtype = qtype; + resolv_response_drop (b); + previous_server_index = ctx->server_index; + + if (test_verbose) + printf ("info: buffering first query for: %s\n", qname); + } + else + { + TEST_VERIFY (previous_query != 0); + TEST_COMPARE (ctx->server_index, previous_server_index); + TEST_VERIFY (previous_qtype != qtype); /* Not a duplicate. */ + + /* If reordering, send a response for this query explicitly, and + then skip the implicit send. */ + if (parsed.reorder[ctx->server_index]) + { + if (test_verbose) + printf ("info: sending reordered second response for: %s\n", + qname); + build_response (ctx, b, qname, qclass, qtype); + resolv_response_send_udp (ctx, b); + resolv_response_drop (b); + } + + /* Build a response for the previous query and send it, thus + reordering the two responses. */ + { + if (test_verbose) + printf ("info: sending first response for: %s\n", qname); + struct resolv_response_builder *btmp + = resolv_response_builder_allocate (previous_query->query_buffer, + previous_query->query_length); + build_response (ctx, btmp, qname, qclass, previous_qtype); + resolv_response_send_udp (ctx, btmp); + resolv_response_builder_free (btmp); + } + + /* If not reordering, send the reply as usual. */ + if (!parsed.reorder[ctx->server_index]) + { + if (test_verbose) + printf ("info: sending non-reordered second response for: %s\n", + qname); + build_response (ctx, b, qname, qclass, qtype); + } + + /* Unbuffer the response and prepare for the next query. */ + resolv_response_context_free (previous_query); + previous_query = NULL; + previous_qtype = 0; + previous_server_index = -1; + } +} + +/* Runs a query for QNAME and checks for the expected reply. See + struct parsed_qname for the expected format for QNAME. */ +static void +test_qname (const char *qname, int rcode) +{ + struct resolv_context *ctx = __resolv_context_get (); + TEST_VERIFY_EXIT (ctx != NULL); + + unsigned char q1[512]; + int q1len = res_mkquery (QUERY, qname, C_IN, T_A, NULL, 0, NULL, + q1, sizeof (q1)); + TEST_VERIFY_EXIT (q1len > 12); + + unsigned char q2[512]; + int q2len = res_mkquery (QUERY, qname, C_IN, T_AAAA, NULL, 0, NULL, + q2, sizeof (q2)); + TEST_VERIFY_EXIT (q2len > 12); + + /* Produce a transaction ID collision. */ + memcpy (q2, q1, 2); + + unsigned char ans1[512]; + unsigned char *ans1p = ans1; + unsigned char *ans2p = NULL; + int nans2p = 0; + int resplen2 = 0; + int ans2p_malloced = 0; + + /* Perform a parallel A/AAAA query. */ + int resplen1 = __res_context_send (ctx, q1, q1len, q2, q2len, + ans1, sizeof (ans1), &ans1p, + &ans2p, &nans2p, + &resplen2, &ans2p_malloced); + + TEST_VERIFY (resplen1 > 12); + TEST_VERIFY (resplen2 > 12); + if (resplen1 <= 12 || resplen2 <= 12) + return; + + if (rcode == 1 || rcode == 3) + { + /* Format Error and Name Error responses does not trigger + switching to the next server. */ + TEST_COMPARE (ans1p[3] & 0x0f, rcode); + TEST_COMPARE (ans2p[3] & 0x0f, rcode); + return; + } + + /* The response should be successful. */ + TEST_COMPARE (ans1p[3] & 0x0f, 0); + TEST_COMPARE (ans2p[3] & 0x0f, 0); + + /* Due to bug 19691, the answer may not be in the slot matching the + query. Assume that the AAAA response is the longer one. */ + unsigned char *a_answer; + int a_answer_length; + unsigned char *aaaa_answer; + int aaaa_answer_length; + if (resplen2 > resplen1) + { + a_answer = ans1p; + a_answer_length = resplen1; + aaaa_answer = ans2p; + aaaa_answer_length = resplen2; + } + else + { + a_answer = ans2p; + a_answer_length = resplen2; + aaaa_answer = ans1p; + aaaa_answer_length = resplen1; + } + + { + char *expected = xasprintf ("name: %s\n" + "address: 192.0.2.1\n", + qname); + check_dns_packet (qname, a_answer, a_answer_length, expected); + free (expected); + } + { + char *expected = xasprintf ("name: %s\n" + "address: 2001:db8::1\n", + qname); + check_dns_packet (qname, aaaa_answer, aaaa_answer_length, expected); + free (expected); + } + + if (ans2p_malloced) + free (ans2p); + + __resolv_context_put (ctx); +} + +static int +do_test (void) +{ + struct resolv_test *aux = resolv_test_start + ((struct resolv_redirect_config) + { + .response_callback = response, + + /* The response callback use global state (the previous_* + variables), and query processing must therefore be + serialized. */ + .single_thread_udp = true, + }); + + for (int rcode = 0; rcode <= 5; ++rcode) + for (int do_reorder_0 = 0; do_reorder_0 < 2; ++do_reorder_0) + for (int do_reorder_1 = 0; do_reorder_1 < 2; ++do_reorder_1) + { + char *qname = xasprintf ("reorder-%d-%d.rcode-%d.example.net", + do_reorder_0, do_reorder_1, rcode); + test_qname (qname, rcode); + free (qname); + } + + resolv_test_end (aux); + + return 0; +} + +#include diff --git a/rt/Makefile b/rt/Makefile index dab5d62a57..93502cfaa7 100644 --- a/rt/Makefile +++ b/rt/Makefile @@ -44,6 +44,7 @@ tests := tst-shm tst-timer tst-timer2 \ tst-aio7 tst-aio8 tst-aio9 tst-aio10 \ tst-mqueue1 tst-mqueue2 tst-mqueue3 tst-mqueue4 \ tst-mqueue5 tst-mqueue6 tst-mqueue7 tst-mqueue8 tst-mqueue9 \ + tst-bz28213 \ tst-timer3 tst-timer4 tst-timer5 \ tst-cpuclock2 tst-cputimer1 tst-cputimer2 tst-cputimer3 \ tst-shm-cancel diff --git a/rt/tst-bz28213.c b/rt/tst-bz28213.c new file mode 100644 index 0000000000..0c096b5a0a --- /dev/null +++ b/rt/tst-bz28213.c @@ -0,0 +1,101 @@ +/* Bug 28213: test for NULL pointer dereference in mq_notify. + Copyright (C) The GNU Toolchain Authors. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static mqd_t m = -1; +static const char msg[] = "hello"; + +static void +check_bz28213_cb (union sigval sv) +{ + char buf[sizeof (msg)]; + + (void) sv; + + TEST_VERIFY_EXIT ((size_t) mq_receive (m, buf, sizeof (buf), NULL) + == sizeof (buf)); + TEST_VERIFY_EXIT (memcmp (buf, msg, sizeof (buf)) == 0); + + exit (0); +} + +static void +check_bz28213 (void) +{ + struct sigevent sev; + + memset (&sev, '\0', sizeof (sev)); + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = check_bz28213_cb; + + /* Step 1: Register & unregister notifier. + Helper thread should receive NOTIFY_REMOVED notification. + In a vulnerable version of glibc, NULL pointer dereference follows. */ + TEST_VERIFY_EXIT (mq_notify (m, &sev) == 0); + TEST_VERIFY_EXIT (mq_notify (m, NULL) == 0); + + /* Step 2: Once again, register notification. + Try to send one message. + Test is considered successful, if the callback does exit (0). */ + TEST_VERIFY_EXIT (mq_notify (m, &sev) == 0); + TEST_VERIFY_EXIT (mq_send (m, msg, sizeof (msg), 1) == 0); + + /* Wait... */ + pause (); +} + +static int +do_test (void) +{ + static const char m_name[] = "/bz28213_queue"; + struct mq_attr m_attr; + + memset (&m_attr, '\0', sizeof (m_attr)); + m_attr.mq_maxmsg = 1; + m_attr.mq_msgsize = sizeof (msg); + + m = mq_open (m_name, + O_RDWR | O_CREAT | O_EXCL, + 0600, + &m_attr); + + if (m < 0) + { + if (errno == ENOSYS) + FAIL_UNSUPPORTED ("POSIX message queues are not implemented\n"); + FAIL_EXIT1 ("Failed to create POSIX message queue: %m\n"); + } + + TEST_VERIFY_EXIT (mq_unlink (m_name) == 0); + + check_bz28213 (); + + return 0; +} + +#include diff --git a/stdio-common/Makefile b/stdio-common/Makefile index 8475fd1f09..eff0c98d82 100644 --- a/stdio-common/Makefile +++ b/stdio-common/Makefile @@ -69,7 +69,8 @@ tests := tstscanf test_rdwr test-popen tstgetln test-fseek \ tst-printf-bz25691 \ tst-vfprintf-width-prec-alloc \ tst-printf-fp-free \ - tst-printf-fp-leak + tst-printf-fp-leak \ + test-strerr test-srcs = tst-unbputc tst-printf tst-printfsz-islongdouble diff --git a/stdio-common/errlist.c b/stdio-common/errlist.c index d15f13a22a..2ecf121674 100644 --- a/stdio-common/errlist.c +++ b/stdio-common/errlist.c @@ -20,9 +20,13 @@ #include #include +#ifndef ERR_MAP +# define ERR_MAP(n) n +#endif + const char *const _sys_errlist_internal[] = { -#define _S(n, str) [n] = str, +#define _S(n, str) [ERR_MAP(n)] = str, #include #undef _S }; @@ -41,20 +45,21 @@ static const union sys_errname_t { #define MSGSTRFIELD1(line) str##line #define MSGSTRFIELD(line) MSGSTRFIELD1(line) -#define _S(n, str) char MSGSTRFIELD(__LINE__)[sizeof(str)]; +#define _S(n, str) char MSGSTRFIELD(__LINE__)[sizeof(#n)]; #include #undef _S }; char str[0]; } _sys_errname = { { -#define _S(n, s) s, +#define _S(n, s) #n, #include #undef _S } }; static const unsigned short _sys_errnameidx[] = { -#define _S(n, s) [n] = offsetof(union sys_errname_t, MSGSTRFIELD(__LINE__)), +#define _S(n, s) \ + [ERR_MAP(n)] = offsetof(union sys_errname_t, MSGSTRFIELD(__LINE__)), #include #undef _S }; diff --git a/stdio-common/test-strerr.c b/stdio-common/test-strerr.c index fded208118..d77b81d507 100644 --- a/stdio-common/test-strerr.c +++ b/stdio-common/test-strerr.c @@ -18,46 +18,672 @@ #include #include -#include #include #include -#define N_(name) name - -static const char *const errlist[] = - { -/* This file is auto-generated from errlist.def. */ -#include - }; - -#define MSGSTR_T errname_t -#define MSGSTR errname -#define MSGIDX errnameidx -#include -#undef MSGSTR -#undef MSGIDX - static int do_test (void) { - TEST_VERIFY (strerrordesc_np (-1) == NULL); - TEST_VERIFY (strerrordesc_np (array_length (errlist)) == NULL); - for (size_t i = 0; i < array_length (errlist); i++) - { - if (errlist[i] == NULL) - continue; - TEST_COMPARE_STRING (strerrordesc_np (i), errlist[i]); - } + TEST_COMPARE_STRING (strerrordesc_np (0), "Success"); + TEST_COMPARE_STRING (strerrorname_np (0), "0"); - TEST_VERIFY (strerrorname_np (-1) == NULL); - TEST_VERIFY (strerrorname_np (array_length (errlist)) == NULL); - for (size_t i = 0; i < array_length (errlist); i++) - { - if (errlist[i] == NULL) - continue; - TEST_COMPARE_STRING (strerrorname_np (i), errname.str + errnameidx[i]); - } +#ifdef EPERM + TEST_COMPARE_STRING (strerrordesc_np (EPERM), "Operation not permitted"); + TEST_COMPARE_STRING (strerrorname_np (EPERM), "EPERM"); +#endif +#ifdef ENOENT + TEST_COMPARE_STRING (strerrordesc_np (ENOENT), + "No such file or directory"); + TEST_COMPARE_STRING (strerrorname_np (ENOENT), "ENOENT"); +#endif +#ifdef ESRCH + TEST_COMPARE_STRING (strerrordesc_np (ESRCH), "No such process"); + TEST_COMPARE_STRING (strerrorname_np (ESRCH), "ESRCH"); +#endif +#ifdef EINTR + TEST_COMPARE_STRING (strerrordesc_np (EINTR), "Interrupted system call"); + TEST_COMPARE_STRING (strerrorname_np (EINTR), "EINTR"); +#endif +#ifdef EIO + TEST_COMPARE_STRING (strerrordesc_np (EIO), "Input/output error"); + TEST_COMPARE_STRING (strerrorname_np (EIO), "EIO"); +#endif +#ifdef ENXIO + TEST_COMPARE_STRING (strerrordesc_np (ENXIO), "No such device or address"); + TEST_COMPARE_STRING (strerrorname_np (ENXIO), "ENXIO"); +#endif +#ifdef E2BIG + TEST_COMPARE_STRING (strerrordesc_np (E2BIG), "Argument list too long"); + TEST_COMPARE_STRING (strerrorname_np (E2BIG), "E2BIG"); +#endif +#ifdef ENOEXEC + TEST_COMPARE_STRING (strerrordesc_np (ENOEXEC), "Exec format error"); + TEST_COMPARE_STRING (strerrorname_np (ENOEXEC), "ENOEXEC"); +#endif +#ifdef EBADF + TEST_COMPARE_STRING (strerrordesc_np (EBADF), "Bad file descriptor"); + TEST_COMPARE_STRING (strerrorname_np (EBADF), "EBADF"); +#endif +#ifdef ECHILD + TEST_COMPARE_STRING (strerrordesc_np (ECHILD), "No child processes"); + TEST_COMPARE_STRING (strerrorname_np (ECHILD), "ECHILD"); +#endif +#ifdef EDEADLK + TEST_COMPARE_STRING (strerrordesc_np (EDEADLK), + "Resource deadlock avoided"); + TEST_COMPARE_STRING (strerrorname_np (EDEADLK), "EDEADLK"); +#endif +#ifdef ENOMEM + TEST_COMPARE_STRING (strerrordesc_np (ENOMEM), "Cannot allocate memory"); + TEST_COMPARE_STRING (strerrorname_np (ENOMEM), "ENOMEM"); +#endif +#ifdef EACCES + TEST_COMPARE_STRING (strerrordesc_np (EACCES), "Permission denied"); + TEST_COMPARE_STRING (strerrorname_np (EACCES), "EACCES"); +#endif +#ifdef EFAULT + TEST_COMPARE_STRING (strerrordesc_np (EFAULT), "Bad address"); + TEST_COMPARE_STRING (strerrorname_np (EFAULT), "EFAULT"); +#endif +#ifdef ENOTBLK + TEST_COMPARE_STRING (strerrordesc_np (ENOTBLK), "Block device required"); + TEST_COMPARE_STRING (strerrorname_np (ENOTBLK), "ENOTBLK"); +#endif +#ifdef EBUSY + TEST_COMPARE_STRING (strerrordesc_np (EBUSY), "Device or resource busy"); + TEST_COMPARE_STRING (strerrorname_np (EBUSY), "EBUSY"); +#endif +#ifdef EEXIST + TEST_COMPARE_STRING (strerrordesc_np (EEXIST), "File exists"); + TEST_COMPARE_STRING (strerrorname_np (EEXIST), "EEXIST"); +#endif +#ifdef EXDEV + TEST_COMPARE_STRING (strerrordesc_np (EXDEV), "Invalid cross-device link"); + TEST_COMPARE_STRING (strerrorname_np (EXDEV), "EXDEV"); +#endif +#ifdef ENODEV + TEST_COMPARE_STRING (strerrordesc_np (ENODEV), "No such device"); + TEST_COMPARE_STRING (strerrorname_np (ENODEV), "ENODEV"); +#endif +#ifdef ENOTDIR + TEST_COMPARE_STRING (strerrordesc_np (ENOTDIR), "Not a directory"); + TEST_COMPARE_STRING (strerrorname_np (ENOTDIR), "ENOTDIR"); +#endif +#ifdef EISDIR + TEST_COMPARE_STRING (strerrordesc_np (EISDIR), "Is a directory"); + TEST_COMPARE_STRING (strerrorname_np (EISDIR), "EISDIR"); +#endif +#ifdef EINVAL + TEST_COMPARE_STRING (strerrordesc_np (EINVAL), "Invalid argument"); + TEST_COMPARE_STRING (strerrorname_np (EINVAL), "EINVAL"); +#endif +#ifdef EMFILE + TEST_COMPARE_STRING (strerrordesc_np (EMFILE), "Too many open files"); + TEST_COMPARE_STRING (strerrorname_np (EMFILE), "EMFILE"); +#endif +#ifdef ENFILE + TEST_COMPARE_STRING (strerrordesc_np (ENFILE), + "Too many open files in system"); + TEST_COMPARE_STRING (strerrorname_np (ENFILE), "ENFILE"); +#endif +#ifdef ENOTTY + TEST_COMPARE_STRING (strerrordesc_np (ENOTTY), + "Inappropriate ioctl for device"); + TEST_COMPARE_STRING (strerrorname_np (ENOTTY), "ENOTTY"); +#endif +#ifdef ETXTBSY + TEST_COMPARE_STRING (strerrordesc_np (ETXTBSY), "Text file busy"); + TEST_COMPARE_STRING (strerrorname_np (ETXTBSY), "ETXTBSY"); +#endif +#ifdef EFBIG + TEST_COMPARE_STRING (strerrordesc_np (EFBIG), "File too large"); + TEST_COMPARE_STRING (strerrorname_np (EFBIG), "EFBIG"); +#endif +#ifdef ENOSPC + TEST_COMPARE_STRING (strerrordesc_np (ENOSPC), "No space left on device"); + TEST_COMPARE_STRING (strerrorname_np (ENOSPC), "ENOSPC"); +#endif +#ifdef ESPIPE + TEST_COMPARE_STRING (strerrordesc_np (ESPIPE), "Illegal seek"); + TEST_COMPARE_STRING (strerrorname_np (ESPIPE), "ESPIPE"); +#endif +#ifdef EROFS + TEST_COMPARE_STRING (strerrordesc_np (EROFS), "Read-only file system"); + TEST_COMPARE_STRING (strerrorname_np (EROFS), "EROFS"); +#endif +#ifdef EMLINK + TEST_COMPARE_STRING (strerrordesc_np (EMLINK), "Too many links"); + TEST_COMPARE_STRING (strerrorname_np (EMLINK), "EMLINK"); +#endif +#ifdef EPIPE + TEST_COMPARE_STRING (strerrordesc_np (EPIPE), "Broken pipe"); + TEST_COMPARE_STRING (strerrorname_np (EPIPE), "EPIPE"); +#endif +#ifdef EDOM + TEST_COMPARE_STRING (strerrordesc_np (EDOM), + "Numerical argument out of domain"); + TEST_COMPARE_STRING (strerrorname_np (EDOM), "EDOM"); +#endif +#ifdef ERANGE + TEST_COMPARE_STRING (strerrordesc_np (ERANGE), + "Numerical result out of range"); + TEST_COMPARE_STRING (strerrorname_np (ERANGE), "ERANGE"); +#endif +#ifdef EAGAIN + TEST_COMPARE_STRING (strerrordesc_np (EAGAIN), + "Resource temporarily unavailable"); + TEST_COMPARE_STRING (strerrorname_np (EAGAIN), "EAGAIN"); +#endif +#ifdef EINPROGRESS + TEST_COMPARE_STRING (strerrordesc_np (EINPROGRESS), + "Operation now in progress"); + TEST_COMPARE_STRING (strerrorname_np (EINPROGRESS), "EINPROGRESS"); +#endif +#ifdef EALREADY + TEST_COMPARE_STRING (strerrordesc_np (EALREADY), + "Operation already in progress"); + TEST_COMPARE_STRING (strerrorname_np (EALREADY), "EALREADY"); +#endif +#ifdef ENOTSOCK + TEST_COMPARE_STRING (strerrordesc_np (ENOTSOCK), + "Socket operation on non-socket"); + TEST_COMPARE_STRING (strerrorname_np (ENOTSOCK), "ENOTSOCK"); +#endif +#ifdef EMSGSIZE + TEST_COMPARE_STRING (strerrordesc_np (EMSGSIZE), "Message too long"); + TEST_COMPARE_STRING (strerrorname_np (EMSGSIZE), "EMSGSIZE"); +#endif +#ifdef EPROTOTYPE + TEST_COMPARE_STRING (strerrordesc_np (EPROTOTYPE), + "Protocol wrong type for socket"); + TEST_COMPARE_STRING (strerrorname_np (EPROTOTYPE), "EPROTOTYPE"); +#endif +#ifdef ENOPROTOOPT + TEST_COMPARE_STRING (strerrordesc_np (ENOPROTOOPT), + "Protocol not available"); + TEST_COMPARE_STRING (strerrorname_np (ENOPROTOOPT), "ENOPROTOOPT"); +#endif +#ifdef EPROTONOSUPPORT + TEST_COMPARE_STRING (strerrordesc_np (EPROTONOSUPPORT), + "Protocol not supported"); + TEST_COMPARE_STRING (strerrorname_np (EPROTONOSUPPORT), "EPROTONOSUPPORT"); +#endif +#ifdef ESOCKTNOSUPPORT + TEST_COMPARE_STRING (strerrordesc_np (ESOCKTNOSUPPORT), + "Socket type not supported"); + TEST_COMPARE_STRING (strerrorname_np (ESOCKTNOSUPPORT), "ESOCKTNOSUPPORT"); +#endif +#ifdef EOPNOTSUPP + TEST_COMPARE_STRING (strerrordesc_np (EOPNOTSUPP), + "Operation not supported"); + TEST_COMPARE_STRING (strerrorname_np (EOPNOTSUPP), "EOPNOTSUPP"); +#endif +#ifdef EPFNOSUPPORT + TEST_COMPARE_STRING (strerrordesc_np (EPFNOSUPPORT), + "Protocol family not supported"); + TEST_COMPARE_STRING (strerrorname_np (EPFNOSUPPORT), "EPFNOSUPPORT"); +#endif +#ifdef EAFNOSUPPORT + TEST_COMPARE_STRING (strerrordesc_np (EAFNOSUPPORT), + "Address family not supported by protocol"); + TEST_COMPARE_STRING (strerrorname_np (EAFNOSUPPORT), "EAFNOSUPPORT"); +#endif +#ifdef EADDRINUSE + TEST_COMPARE_STRING (strerrordesc_np (EADDRINUSE), + "Address already in use"); + TEST_COMPARE_STRING (strerrorname_np (EADDRINUSE), "EADDRINUSE"); +#endif +#ifdef EADDRNOTAVAIL + TEST_COMPARE_STRING (strerrordesc_np (EADDRNOTAVAIL), + "Cannot assign requested address"); + TEST_COMPARE_STRING (strerrorname_np (EADDRNOTAVAIL), "EADDRNOTAVAIL"); +#endif +#ifdef ENETDOWN + TEST_COMPARE_STRING (strerrordesc_np (ENETDOWN), "Network is down"); + TEST_COMPARE_STRING (strerrorname_np (ENETDOWN), "ENETDOWN"); +#endif +#ifdef ENETUNREACH + TEST_COMPARE_STRING (strerrordesc_np (ENETUNREACH), + "Network is unreachable"); + TEST_COMPARE_STRING (strerrorname_np (ENETUNREACH), "ENETUNREACH"); +#endif +#ifdef ENETRESET + TEST_COMPARE_STRING (strerrordesc_np (ENETRESET), + "Network dropped connection on reset"); + TEST_COMPARE_STRING (strerrorname_np (ENETRESET), "ENETRESET"); +#endif +#ifdef ECONNABORTED + TEST_COMPARE_STRING (strerrordesc_np (ECONNABORTED), + "Software caused connection abort"); + TEST_COMPARE_STRING (strerrorname_np (ECONNABORTED), "ECONNABORTED"); +#endif +#ifdef ECONNRESET + TEST_COMPARE_STRING (strerrordesc_np (ECONNRESET), + "Connection reset by peer"); + TEST_COMPARE_STRING (strerrorname_np (ECONNRESET), "ECONNRESET"); +#endif +#ifdef ENOBUFS + TEST_COMPARE_STRING (strerrordesc_np (ENOBUFS), + "No buffer space available"); + TEST_COMPARE_STRING (strerrorname_np (ENOBUFS), "ENOBUFS"); +#endif +#ifdef EISCONN + TEST_COMPARE_STRING (strerrordesc_np (EISCONN), + "Transport endpoint is already connected"); + TEST_COMPARE_STRING (strerrorname_np (EISCONN), "EISCONN"); +#endif +#ifdef ENOTCONN + TEST_COMPARE_STRING (strerrordesc_np (ENOTCONN), + "Transport endpoint is not connected"); + TEST_COMPARE_STRING (strerrorname_np (ENOTCONN), "ENOTCONN"); +#endif +#ifdef EDESTADDRREQ + TEST_COMPARE_STRING (strerrordesc_np (EDESTADDRREQ), + "Destination address required"); + TEST_COMPARE_STRING (strerrorname_np (EDESTADDRREQ), "EDESTADDRREQ"); +#endif +#ifdef ESHUTDOWN + TEST_COMPARE_STRING (strerrordesc_np (ESHUTDOWN), + "Cannot send after transport endpoint shutdown"); + TEST_COMPARE_STRING (strerrorname_np (ESHUTDOWN), "ESHUTDOWN"); +#endif +#ifdef ETOOMANYREFS + TEST_COMPARE_STRING (strerrordesc_np (ETOOMANYREFS), + "Too many references: cannot splice"); + TEST_COMPARE_STRING (strerrorname_np (ETOOMANYREFS), "ETOOMANYREFS"); +#endif +#ifdef ETIMEDOUT + TEST_COMPARE_STRING (strerrordesc_np (ETIMEDOUT), "Connection timed out"); + TEST_COMPARE_STRING (strerrorname_np (ETIMEDOUT), "ETIMEDOUT"); +#endif +#ifdef ECONNREFUSED + TEST_COMPARE_STRING (strerrordesc_np (ECONNREFUSED), "Connection refused"); + TEST_COMPARE_STRING (strerrorname_np (ECONNREFUSED), "ECONNREFUSED"); +#endif +#ifdef ELOOP + TEST_COMPARE_STRING (strerrordesc_np (ELOOP), + "Too many levels of symbolic links"); + TEST_COMPARE_STRING (strerrorname_np (ELOOP), "ELOOP"); +#endif +#ifdef ENAMETOOLONG + TEST_COMPARE_STRING (strerrordesc_np (ENAMETOOLONG), "File name too long"); + TEST_COMPARE_STRING (strerrorname_np (ENAMETOOLONG), "ENAMETOOLONG"); +#endif +#ifdef EHOSTDOWN + TEST_COMPARE_STRING (strerrordesc_np (EHOSTDOWN), "Host is down"); + TEST_COMPARE_STRING (strerrorname_np (EHOSTDOWN), "EHOSTDOWN"); +#endif +#ifdef EHOSTUNREACH + TEST_COMPARE_STRING (strerrordesc_np (EHOSTUNREACH), "No route to host"); + TEST_COMPARE_STRING (strerrorname_np (EHOSTUNREACH), "EHOSTUNREACH"); +#endif +#ifdef ENOTEMPTY + TEST_COMPARE_STRING (strerrordesc_np (ENOTEMPTY), "Directory not empty"); + TEST_COMPARE_STRING (strerrorname_np (ENOTEMPTY), "ENOTEMPTY"); +#endif +#ifdef EUSERS + TEST_COMPARE_STRING (strerrordesc_np (EUSERS), "Too many users"); + TEST_COMPARE_STRING (strerrorname_np (EUSERS), "EUSERS"); +#endif +#ifdef EDQUOT + TEST_COMPARE_STRING (strerrordesc_np (EDQUOT), "Disk quota exceeded"); + TEST_COMPARE_STRING (strerrorname_np (EDQUOT), "EDQUOT"); +#endif +#ifdef ESTALE + TEST_COMPARE_STRING (strerrordesc_np (ESTALE), "Stale file handle"); + TEST_COMPARE_STRING (strerrorname_np (ESTALE), "ESTALE"); +#endif +#ifdef EREMOTE + TEST_COMPARE_STRING (strerrordesc_np (EREMOTE), "Object is remote"); + TEST_COMPARE_STRING (strerrorname_np (EREMOTE), "EREMOTE"); +#endif +#ifdef ENOLCK + TEST_COMPARE_STRING (strerrordesc_np (ENOLCK), "No locks available"); + TEST_COMPARE_STRING (strerrorname_np (ENOLCK), "ENOLCK"); +#endif +#ifdef ENOSYS + TEST_COMPARE_STRING (strerrordesc_np (ENOSYS), "Function not implemented"); + TEST_COMPARE_STRING (strerrorname_np (ENOSYS), "ENOSYS"); +#endif +#ifdef EILSEQ + TEST_COMPARE_STRING (strerrordesc_np (EILSEQ), + "Invalid or incomplete multibyte or wide character"); + TEST_COMPARE_STRING (strerrorname_np (EILSEQ), "EILSEQ"); +#endif +#ifdef EBADMSG + TEST_COMPARE_STRING (strerrordesc_np (EBADMSG), "Bad message"); + TEST_COMPARE_STRING (strerrorname_np (EBADMSG), "EBADMSG"); +#endif +#ifdef EIDRM + TEST_COMPARE_STRING (strerrordesc_np (EIDRM), "Identifier removed"); + TEST_COMPARE_STRING (strerrorname_np (EIDRM), "EIDRM"); +#endif +#ifdef EMULTIHOP + TEST_COMPARE_STRING (strerrordesc_np (EMULTIHOP), "Multihop attempted"); + TEST_COMPARE_STRING (strerrorname_np (EMULTIHOP), "EMULTIHOP"); +#endif +#ifdef ENODATA + TEST_COMPARE_STRING (strerrordesc_np (ENODATA), "No data available"); + TEST_COMPARE_STRING (strerrorname_np (ENODATA), "ENODATA"); +#endif +#ifdef ENOLINK + TEST_COMPARE_STRING (strerrordesc_np (ENOLINK), "Link has been severed"); + TEST_COMPARE_STRING (strerrorname_np (ENOLINK), "ENOLINK"); +#endif +#ifdef ENOMSG + TEST_COMPARE_STRING (strerrordesc_np (ENOMSG), + "No message of desired type"); + TEST_COMPARE_STRING (strerrorname_np (ENOMSG), "ENOMSG"); +#endif +#ifdef ENOSR + TEST_COMPARE_STRING (strerrordesc_np (ENOSR), "Out of streams resources"); + TEST_COMPARE_STRING (strerrorname_np (ENOSR), "ENOSR"); +#endif +#ifdef ENOSTR + TEST_COMPARE_STRING (strerrordesc_np (ENOSTR), "Device not a stream"); + TEST_COMPARE_STRING (strerrorname_np (ENOSTR), "ENOSTR"); +#endif +#ifdef EOVERFLOW + TEST_COMPARE_STRING (strerrordesc_np (EOVERFLOW), + "Value too large for defined data type"); + TEST_COMPARE_STRING (strerrorname_np (EOVERFLOW), "EOVERFLOW"); +#endif +#ifdef EPROTO + TEST_COMPARE_STRING (strerrordesc_np (EPROTO), "Protocol error"); + TEST_COMPARE_STRING (strerrorname_np (EPROTO), "EPROTO"); +#endif +#ifdef ETIME + TEST_COMPARE_STRING (strerrordesc_np (ETIME), "Timer expired"); + TEST_COMPARE_STRING (strerrorname_np (ETIME), "ETIME"); +#endif +#ifdef ECANCELED + TEST_COMPARE_STRING (strerrordesc_np (ECANCELED), "Operation canceled"); + TEST_COMPARE_STRING (strerrorname_np (ECANCELED), "ECANCELED"); +#endif +#ifdef EOWNERDEAD + TEST_COMPARE_STRING (strerrordesc_np (EOWNERDEAD), "Owner died"); + TEST_COMPARE_STRING (strerrorname_np (EOWNERDEAD), "EOWNERDEAD"); +#endif +#ifdef ENOTRECOVERABLE + TEST_COMPARE_STRING (strerrordesc_np (ENOTRECOVERABLE), + "State not recoverable"); + TEST_COMPARE_STRING (strerrorname_np (ENOTRECOVERABLE), "ENOTRECOVERABLE"); +#endif +#ifdef ERESTART + TEST_COMPARE_STRING (strerrordesc_np (ERESTART), + "Interrupted system call should be restarted"); + TEST_COMPARE_STRING (strerrorname_np (ERESTART), "ERESTART"); +#endif +#ifdef ECHRNG + TEST_COMPARE_STRING (strerrordesc_np (ECHRNG), + "Channel number out of range"); + TEST_COMPARE_STRING (strerrorname_np (ECHRNG), "ECHRNG"); +#endif +#ifdef EL2NSYNC + TEST_COMPARE_STRING (strerrordesc_np (EL2NSYNC), + "Level 2 not synchronized"); + TEST_COMPARE_STRING (strerrorname_np (EL2NSYNC), "EL2NSYNC"); +#endif +#ifdef EL3HLT + TEST_COMPARE_STRING (strerrordesc_np (EL3HLT), "Level 3 halted"); + TEST_COMPARE_STRING (strerrorname_np (EL3HLT), "EL3HLT"); +#endif +#ifdef EL3RST + TEST_COMPARE_STRING (strerrordesc_np (EL3RST), "Level 3 reset"); + TEST_COMPARE_STRING (strerrorname_np (EL3RST), "EL3RST"); +#endif +#ifdef ELNRNG + TEST_COMPARE_STRING (strerrordesc_np (ELNRNG), "Link number out of range"); + TEST_COMPARE_STRING (strerrorname_np (ELNRNG), "ELNRNG"); +#endif +#ifdef EUNATCH + TEST_COMPARE_STRING (strerrordesc_np (EUNATCH), + "Protocol driver not attached"); + TEST_COMPARE_STRING (strerrorname_np (EUNATCH), "EUNATCH"); +#endif +#ifdef ENOCSI + TEST_COMPARE_STRING (strerrordesc_np (ENOCSI), + "No CSI structure available"); + TEST_COMPARE_STRING (strerrorname_np (ENOCSI), "ENOCSI"); +#endif +#ifdef EL2HLT + TEST_COMPARE_STRING (strerrordesc_np (EL2HLT), "Level 2 halted"); + TEST_COMPARE_STRING (strerrorname_np (EL2HLT), "EL2HLT"); +#endif +#ifdef EBADE + TEST_COMPARE_STRING (strerrordesc_np (EBADE), "Invalid exchange"); + TEST_COMPARE_STRING (strerrorname_np (EBADE), "EBADE"); +#endif +#ifdef EBADR + TEST_COMPARE_STRING (strerrordesc_np (EBADR), + "Invalid request descriptor"); + TEST_COMPARE_STRING (strerrorname_np (EBADR), "EBADR"); +#endif +#ifdef EXFULL + TEST_COMPARE_STRING (strerrordesc_np (EXFULL), "Exchange full"); + TEST_COMPARE_STRING (strerrorname_np (EXFULL), "EXFULL"); +#endif +#ifdef ENOANO + TEST_COMPARE_STRING (strerrordesc_np (ENOANO), "No anode"); + TEST_COMPARE_STRING (strerrorname_np (ENOANO), "ENOANO"); +#endif +#ifdef EBADRQC + TEST_COMPARE_STRING (strerrordesc_np (EBADRQC), "Invalid request code"); + TEST_COMPARE_STRING (strerrorname_np (EBADRQC), "EBADRQC"); +#endif +#ifdef EBADSLT + TEST_COMPARE_STRING (strerrordesc_np (EBADSLT), "Invalid slot"); + TEST_COMPARE_STRING (strerrorname_np (EBADSLT), "EBADSLT"); +#endif +#ifdef EBFONT + TEST_COMPARE_STRING (strerrordesc_np (EBFONT), "Bad font file format"); + TEST_COMPARE_STRING (strerrorname_np (EBFONT), "EBFONT"); +#endif +#ifdef ENONET + TEST_COMPARE_STRING (strerrordesc_np (ENONET), + "Machine is not on the network"); + TEST_COMPARE_STRING (strerrorname_np (ENONET), "ENONET"); +#endif +#ifdef ENOPKG + TEST_COMPARE_STRING (strerrordesc_np (ENOPKG), "Package not installed"); + TEST_COMPARE_STRING (strerrorname_np (ENOPKG), "ENOPKG"); +#endif +#ifdef EADV + TEST_COMPARE_STRING (strerrordesc_np (EADV), "Advertise error"); + TEST_COMPARE_STRING (strerrorname_np (EADV), "EADV"); +#endif +#ifdef ESRMNT + TEST_COMPARE_STRING (strerrordesc_np (ESRMNT), "Srmount error"); + TEST_COMPARE_STRING (strerrorname_np (ESRMNT), "ESRMNT"); +#endif +#ifdef ECOMM + TEST_COMPARE_STRING (strerrordesc_np (ECOMM), + "Communication error on send"); + TEST_COMPARE_STRING (strerrorname_np (ECOMM), "ECOMM"); +#endif +#ifdef EDOTDOT + TEST_COMPARE_STRING (strerrordesc_np (EDOTDOT), "RFS specific error"); + TEST_COMPARE_STRING (strerrorname_np (EDOTDOT), "EDOTDOT"); +#endif +#ifdef ENOTUNIQ + TEST_COMPARE_STRING (strerrordesc_np (ENOTUNIQ), + "Name not unique on network"); + TEST_COMPARE_STRING (strerrorname_np (ENOTUNIQ), "ENOTUNIQ"); +#endif +#ifdef EBADFD + TEST_COMPARE_STRING (strerrordesc_np (EBADFD), + "File descriptor in bad state"); + TEST_COMPARE_STRING (strerrorname_np (EBADFD), "EBADFD"); +#endif +#ifdef EREMCHG + TEST_COMPARE_STRING (strerrordesc_np (EREMCHG), "Remote address changed"); + TEST_COMPARE_STRING (strerrorname_np (EREMCHG), "EREMCHG"); +#endif +#ifdef ELIBACC + TEST_COMPARE_STRING (strerrordesc_np (ELIBACC), + "Can not access a needed shared library"); + TEST_COMPARE_STRING (strerrorname_np (ELIBACC), "ELIBACC"); +#endif +#ifdef ELIBBAD + TEST_COMPARE_STRING (strerrordesc_np (ELIBBAD), + "Accessing a corrupted shared library"); + TEST_COMPARE_STRING (strerrorname_np (ELIBBAD), "ELIBBAD"); +#endif +#ifdef ELIBSCN + TEST_COMPARE_STRING (strerrordesc_np (ELIBSCN), + ".lib section in a.out corrupted"); + TEST_COMPARE_STRING (strerrorname_np (ELIBSCN), "ELIBSCN"); +#endif +#ifdef ELIBMAX + TEST_COMPARE_STRING (strerrordesc_np (ELIBMAX), + "Attempting to link in too many shared libraries"); + TEST_COMPARE_STRING (strerrorname_np (ELIBMAX), "ELIBMAX"); +#endif +#ifdef ELIBEXEC + TEST_COMPARE_STRING (strerrordesc_np (ELIBEXEC), + "Cannot exec a shared library directly"); + TEST_COMPARE_STRING (strerrorname_np (ELIBEXEC), "ELIBEXEC"); +#endif +#ifdef ESTRPIPE + TEST_COMPARE_STRING (strerrordesc_np (ESTRPIPE), "Streams pipe error"); + TEST_COMPARE_STRING (strerrorname_np (ESTRPIPE), "ESTRPIPE"); +#endif +#ifdef EUCLEAN + TEST_COMPARE_STRING (strerrordesc_np (EUCLEAN), + "Structure needs cleaning"); + TEST_COMPARE_STRING (strerrorname_np (EUCLEAN), "EUCLEAN"); +#endif +#ifdef ENOTNAM + TEST_COMPARE_STRING (strerrordesc_np (ENOTNAM), + "Not a XENIX named type file"); + TEST_COMPARE_STRING (strerrorname_np (ENOTNAM), "ENOTNAM"); +#endif +#ifdef ENAVAIL + TEST_COMPARE_STRING (strerrordesc_np (ENAVAIL), + "No XENIX semaphores available"); + TEST_COMPARE_STRING (strerrorname_np (ENAVAIL), "ENAVAIL"); +#endif +#ifdef EISNAM + TEST_COMPARE_STRING (strerrordesc_np (EISNAM), "Is a named type file"); + TEST_COMPARE_STRING (strerrorname_np (EISNAM), "EISNAM"); +#endif +#ifdef EREMOTEIO + TEST_COMPARE_STRING (strerrordesc_np (EREMOTEIO), "Remote I/O error"); + TEST_COMPARE_STRING (strerrorname_np (EREMOTEIO), "EREMOTEIO"); +#endif +#ifdef ENOMEDIUM + TEST_COMPARE_STRING (strerrordesc_np (ENOMEDIUM), "No medium found"); + TEST_COMPARE_STRING (strerrorname_np (ENOMEDIUM), "ENOMEDIUM"); +#endif +#ifdef EMEDIUMTYPE + TEST_COMPARE_STRING (strerrordesc_np (EMEDIUMTYPE), "Wrong medium type"); + TEST_COMPARE_STRING (strerrorname_np (EMEDIUMTYPE), "EMEDIUMTYPE"); +#endif +#ifdef ENOKEY + TEST_COMPARE_STRING (strerrordesc_np (ENOKEY), + "Required key not available"); + TEST_COMPARE_STRING (strerrorname_np (ENOKEY), "ENOKEY"); +#endif +#ifdef EKEYEXPIRED + TEST_COMPARE_STRING (strerrordesc_np (EKEYEXPIRED), "Key has expired"); + TEST_COMPARE_STRING (strerrorname_np (EKEYEXPIRED), "EKEYEXPIRED"); +#endif +#ifdef EKEYREVOKED + TEST_COMPARE_STRING (strerrordesc_np (EKEYREVOKED), + "Key has been revoked"); + TEST_COMPARE_STRING (strerrorname_np (EKEYREVOKED), "EKEYREVOKED"); +#endif +#ifdef EKEYREJECTED + TEST_COMPARE_STRING (strerrordesc_np (EKEYREJECTED), + "Key was rejected by service"); + TEST_COMPARE_STRING (strerrorname_np (EKEYREJECTED), "EKEYREJECTED"); +#endif +#ifdef ERFKILL + TEST_COMPARE_STRING (strerrordesc_np (ERFKILL), + "Operation not possible due to RF-kill"); + TEST_COMPARE_STRING (strerrorname_np (ERFKILL), "ERFKILL"); +#endif +#ifdef EHWPOISON + TEST_COMPARE_STRING (strerrordesc_np (EHWPOISON), + "Memory page has hardware error"); + TEST_COMPARE_STRING (strerrorname_np (EHWPOISON), "EHWPOISON"); +#endif +#ifdef EBADRPC + TEST_COMPARE_STRING (strerrordesc_np (EBADRPC), "RPC struct is bad"); + TEST_COMPARE_STRING (strerrorname_np (EBADRPC), "EBADRPC"); +#endif +#ifdef EFTYPE + TEST_COMPARE_STRING (strerrordesc_np (EFTYPE), + "Inappropriate file type or format"); + TEST_COMPARE_STRING (strerrorname_np (EFTYPE), "EFTYPE"); +#endif +#ifdef EPROCUNAVAIL + TEST_COMPARE_STRING (strerrordesc_np (EPROCUNAVAIL), + "RPC bad procedure for program"); + TEST_COMPARE_STRING (strerrorname_np (EPROCUNAVAIL), "EPROCUNAVAIL"); +#endif +#ifdef EAUTH + TEST_COMPARE_STRING (strerrordesc_np (EAUTH), "Authentication error"); + TEST_COMPARE_STRING (strerrorname_np (EAUTH), "EAUTH"); +#endif +#ifdef EDIED + TEST_COMPARE_STRING (strerrordesc_np (EDIED), "Translator died"); + TEST_COMPARE_STRING (strerrorname_np (EDIED), "EDIED"); +#endif +#ifdef ERPCMISMATCH + TEST_COMPARE_STRING (strerrordesc_np (ERPCMISMATCH), "RPC version wrong"); + TEST_COMPARE_STRING (strerrorname_np (ERPCMISMATCH), "ERPCMISMATCH"); +#endif +#ifdef EGREGIOUS + TEST_COMPARE_STRING (strerrordesc_np (EGREGIOUS), + "You really blew it this time"); + TEST_COMPARE_STRING (strerrorname_np (EGREGIOUS), "EGREGIOUS"); +#endif +#ifdef EPROCLIM + TEST_COMPARE_STRING (strerrordesc_np (EPROCLIM), "Too many processes"); + TEST_COMPARE_STRING (strerrorname_np (EPROCLIM), "EPROCLIM"); +#endif +#ifdef EGRATUITOUS + TEST_COMPARE_STRING (strerrordesc_np (EGRATUITOUS), "Gratuitous error"); + TEST_COMPARE_STRING (strerrorname_np (EGRATUITOUS), "EGRATUITOUS"); +#endif +#if defined (ENOTSUP) && ENOTSUP != EOPNOTSUPP + TEST_COMPARE_STRING (strerrordesc_np (ENOTSUP), "Not supported"); + TEST_COMPARE_STRING (strerrorname_np (ENOTSUP), "ENOTSUP"); +#endif +#ifdef EPROGMISMATCH + TEST_COMPARE_STRING (strerrordesc_np (EPROGMISMATCH), + "RPC program version wrong"); + TEST_COMPARE_STRING (strerrorname_np (EPROGMISMATCH), "EPROGMISMATCH"); +#endif +#ifdef EBACKGROUND + TEST_COMPARE_STRING (strerrordesc_np (EBACKGROUND), + "Inappropriate operation for background process"); + TEST_COMPARE_STRING (strerrorname_np (EBACKGROUND), "EBACKGROUND"); +#endif +#ifdef EIEIO + TEST_COMPARE_STRING (strerrordesc_np (EIEIO), "Computer bought the farm"); + TEST_COMPARE_STRING (strerrorname_np (EIEIO), "EIEIO"); +#endif +#if defined (EWOULDBLOCK) && EWOULDBLOCK != EAGAIN + TEST_COMPARE_STRING (strerrordesc_np (EWOULDBLOCK), + "Operation would block"); + TEST_COMPARE_STRING (strerrorname_np (EWOULDBLOCK), "EWOULDBLOCK"); +#endif +#ifdef ENEEDAUTH + TEST_COMPARE_STRING (strerrordesc_np (ENEEDAUTH), "Need authenticator"); + TEST_COMPARE_STRING (strerrorname_np (ENEEDAUTH), "ENEEDAUTH"); +#endif +#ifdef ED + TEST_COMPARE_STRING (strerrordesc_np (ED), "?"); + TEST_COMPARE_STRING (strerrorname_np (ED), "ED"); +#endif +#ifdef EPROGUNAVAIL + TEST_COMPARE_STRING (strerrordesc_np (EPROGUNAVAIL), + "RPC program not available"); + TEST_COMPARE_STRING (strerrorname_np (EPROGUNAVAIL), "EPROGUNAVAIL"); +#endif return 0; } diff --git a/stdio-common/vfscanf-internal.c b/stdio-common/vfscanf-internal.c index 95b46dcbeb..3a323547f9 100644 --- a/stdio-common/vfscanf-internal.c +++ b/stdio-common/vfscanf-internal.c @@ -277,7 +277,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr, #endif { va_list arg; - const CHAR_T *f = format; + const UCHAR_T *f = (const UCHAR_T *) format; UCHAR_T fc; /* Current character of the format. */ WINT_T done = 0; /* Assignments done. */ size_t read_in = 0; /* Chars read in. */ @@ -415,10 +415,11 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr, #endif #ifndef COMPILE_WSCANF - if (!isascii ((unsigned char) *f)) + if (!isascii (*f)) { /* Non-ASCII, may be a multibyte. */ - int len = __mbrlen (f, strlen (f), &state); + int len = __mbrlen ((const char *) f, strlen ((const char *) f), + &state); if (len > 0) { do @@ -426,7 +427,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr, c = inchar (); if (__glibc_unlikely (c == EOF)) input_error (); - else if (c != (unsigned char) *f++) + else if (c != *f++) { ungetc_not_eof (c, s); conv_error (); @@ -484,9 +485,9 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr, char_buffer_rewind (&charbuf); /* Check for a positional parameter specification. */ - if (ISDIGIT ((UCHAR_T) *f)) + if (ISDIGIT (*f)) { - argpos = read_int ((const UCHAR_T **) &f); + argpos = read_int (&f); if (*f == L_('$')) ++f; else @@ -521,8 +522,8 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr, /* Find the maximum field width. */ width = 0; - if (ISDIGIT ((UCHAR_T) *f)) - width = read_int ((const UCHAR_T **) &f); + if (ISDIGIT (*f)) + width = read_int (&f); got_width: if (width == 0) width = -1; @@ -2522,12 +2523,11 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr, } while ((fc = *f++) != '\0' && fc != ']') - if (fc == '-' && *f != '\0' && *f != ']' - && (unsigned char) f[-2] <= (unsigned char) *f) + if (fc == '-' && *f != '\0' && *f != ']' && f[-2] <= *f) { /* Add all characters from the one before the '-' up to (but not including) the next format char. */ - for (fc = (unsigned char) f[-2]; fc < (unsigned char) *f; ++fc) + for (fc = f[-2]; fc < *f; ++fc) ((char *)charbuf.scratch.data)[fc] = 1; } else diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c index 3cfe9a05c3..d4b1139c5e 100644 --- a/stdlib/tst-secure-getenv.c +++ b/stdlib/tst-secure-getenv.c @@ -30,167 +30,12 @@ #include #include +#include #include +#include #include static char MAGIC_ARGUMENT[] = "run-actual-test"; -#define MAGIC_STATUS 19 - -/* Return a GID which is not our current GID, but is present in the - supplementary group list. */ -static gid_t -choose_gid (void) -{ - int count = getgroups (0, NULL); - if (count < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t *groups; - groups = xcalloc (count, sizeof (*groups)); - int ret = getgroups (count, groups); - if (ret < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t current = getgid (); - gid_t not_current = 0; - for (int i = 0; i < ret; ++i) - { - if (groups[i] != current) - { - not_current = groups[i]; - break; - } - } - free (groups); - return not_current; -} - - -/* Copies the executable into a restricted directory, so that we can - safely make it SGID with the TARGET group ID. Then runs the - executable. */ -static int -run_executable_sgid (gid_t target) -{ - char *dirname = xasprintf ("%s/secure-getenv.%jd", - test_dir, (intmax_t) getpid ()); - char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = -1; - if (mkdir (dirname, 0700) < 0) - { - printf ("mkdir: %m\n"); - goto err; - } - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) - { - printf ("open (/proc/self/exe): %m\n"); - goto err; - } - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - if (outfd < 0) - { - printf ("open (%s): %m\n", execname); - goto err; - } - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - if (rdcount < 0) - { - printf ("read: %m\n"); - goto err; - } - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - if (wrcount <= 0) - { - printf ("write: %m\n"); - goto err; - } - p += wrcount; - } - } - if (fchown (outfd, getuid (), target) < 0) - { - printf ("fchown (%s): %m\n", execname); - goto err; - } - if (fchmod (outfd, 02750) < 0) - { - printf ("fchmod (%s): %m\n", execname); - goto err; - } - if (close (outfd) < 0) - { - printf ("close (outfd): %m\n"); - goto err; - } - if (close (infd) < 0) - { - printf ("close (infd): %m\n"); - goto err; - } - - int kid = fork (); - if (kid < 0) - { - printf ("fork: %m\n"); - goto err; - } - if (kid == 0) - { - /* Child process. */ - char *args[] = { execname, MAGIC_ARGUMENT, NULL }; - execve (execname, args, environ); - printf ("execve (%s): %m\n", execname); - _exit (1); - } - int status; - if (waitpid (kid, &status, 0) < 0) - { - printf ("waitpid: %m\n"); - goto err; - } - if (!WIFEXITED (status) || WEXITSTATUS (status) != MAGIC_STATUS) - { - printf ("Unexpected exit status %d from child process\n", - status); - goto err; - } - ret = 0; - -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname) - { - unlink (execname); - free (execname); - } - if (dirname) - { - rmdir (dirname); - free (dirname); - } - return ret; -} static int do_test (void) @@ -212,15 +57,15 @@ do_test (void) exit (1); } - gid_t target = choose_gid (); - if (target == 0) - { - fprintf (stderr, - "Could not find a suitable GID for user %jd, skipping test\n", - (intmax_t) getuid ()); - exit (0); - } - return run_executable_sgid (target); + int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); + + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + if (!WIFEXITED (status)) + FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); + + return 0; } static void @@ -229,23 +74,15 @@ alternative_main (int argc, char **argv) if (argc == 2 && strcmp (argv[1], MAGIC_ARGUMENT) == 0) { if (getgid () == getegid ()) - { - /* This can happen if the file system is mounted nosuid. */ - fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n", - (intmax_t) getgid ()); - exit (MAGIC_STATUS); - } + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); if (getenv ("PATH") == NULL) - { - printf ("PATH variable not present\n"); - exit (3); - } + FAIL_EXIT (3, "PATH variable not present\n"); if (secure_getenv ("PATH") != NULL) - { - printf ("PATH variable not filtered out\n"); - exit (4); - } - exit (MAGIC_STATUS); + FAIL_EXIT (4, "PATH variable not filtered out\n"); + + exit (EXIT_SUCCESS); } } diff --git a/string/bits/string_fortified.h b/string/bits/string_fortified.h index 309d0f39b2..c8d3051af8 100644 --- a/string/bits/string_fortified.h +++ b/string/bits/string_fortified.h @@ -22,11 +22,6 @@ # error "Never use directly; include instead." #endif -#if !__GNUC_PREREQ (5,0) -__warndecl (__warn_memset_zero_len, - "memset used with constant zero length parameter; this could be due to transposed parameters"); -#endif - __fortify_function void * __NTH (memcpy (void *__restrict __dest, const void *__restrict __src, size_t __len)) @@ -58,16 +53,6 @@ __NTH (mempcpy (void *__restrict __dest, const void *__restrict __src, __fortify_function void * __NTH (memset (void *__dest, int __ch, size_t __len)) { - /* GCC-5.0 and newer implements these checks in the compiler, so we don't - need them here. */ -#if !__GNUC_PREREQ (5,0) - if (__builtin_constant_p (__len) && __len == 0 - && (!__builtin_constant_p (__ch) || __ch != 0)) - { - __warn_memset_zero_len (); - return __dest; - } -#endif return __builtin___memset_chk (__dest, __ch, __len, __bos0 (__dest)); } diff --git a/string/test-memchr.c b/string/test-memchr.c index 5dd0aa5470..de70e794d9 100644 --- a/string/test-memchr.c +++ b/string/test-memchr.c @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res) CHAR *res = CALL (impl, s, c, n); if (res != exp_res) { - error (0, 0, "Wrong result in function %s %p %p", impl->name, - res, exp_res); + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p", + impl->name, s, c, n, res, exp_res); ret = 1; return; } @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char) } buf[align + len] = 0; - if (pos < len) + if (pos < MIN(n, len)) { buf[align + pos] = seek_char; buf[align + len] = -seek_char; @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char) do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result); } +static void +do_overflow_tests (void) +{ + size_t i, j, len; + const size_t one = 1; + uintptr_t buf_addr = (uintptr_t) buf1; + + for (i = 0; i < 750; ++i) + { + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR); + do_test (0, i, 751, i - buf_addr, BIG_CHAR); + do_test (0, i, 751, -buf_addr - i, BIG_CHAR); + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR); + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR); + + len = 0; + for (j = 8 * sizeof(size_t) - 1; j ; --j) + { + len |= one << j; + do_test (0, i, 751, len - i, BIG_CHAR); + do_test (0, i, 751, len + i, BIG_CHAR); + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR); + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR); + + do_test (0, i, 751, ~len - i, BIG_CHAR); + do_test (0, i, 751, ~len + i, BIG_CHAR); + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR); + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR); + } + } +} + static void do_random_tests (void) { @@ -221,6 +253,7 @@ test_main (void) do_test (page_size / 2 - i, i, i, 1, 0x9B); do_random_tests (); + do_overflow_tests (); return ret; } diff --git a/string/test-strncat.c b/string/test-strncat.c index abbacb95c6..0c7f68d086 100644 --- a/string/test-strncat.c +++ b/string/test-strncat.c @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, } } +static void +do_overflow_tests (void) +{ + size_t i, j, len; + const size_t one = 1; + CHAR *s1, *s2; + uintptr_t s1_addr; + s1 = (CHAR *) buf1; + s2 = (CHAR *) buf2; + s1_addr = (uintptr_t)s1; + for (j = 0; j < 200; ++j) + s2[j] = 32 + 23 * j % (BIG_CHAR - 32); + s2[200] = 0; + for (i = 0; i < 750; ++i) { + for (j = 0; j < i; ++j) + s1[j] = 32 + 23 * j % (BIG_CHAR - 32); + s1[i] = '\0'; + + FOR_EACH_IMPL (impl, 0) + { + s2[200] = '\0'; + do_one_test (impl, s2, s1, SIZE_MAX - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, i - s1_addr); + s2[200] = '\0'; + do_one_test (impl, s2, s1, -s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i); + } + + len = 0; + for (j = 8 * sizeof(size_t) - 1; j ; --j) + { + len |= one << j; + FOR_EACH_IMPL (impl, 0) + { + s2[200] = '\0'; + do_one_test (impl, s2, s1, len - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, len + i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, len - s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, len - s1_addr + i); + + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len + i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len - s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len - s1_addr + i); + } + } + } +} + static void do_random_tests (void) { @@ -316,6 +376,7 @@ test_main (void) } do_random_tests (); + do_overflow_tests (); return ret; } diff --git a/string/test-strnlen.c b/string/test-strnlen.c index 80ac9e8602..a1a6746cc9 100644 --- a/string/test-strnlen.c +++ b/string/test-strnlen.c @@ -27,6 +27,7 @@ #ifndef WIDE # define STRNLEN strnlen +# define MEMSET memset # define CHAR char # define BIG_CHAR CHAR_MAX # define MIDDLE_CHAR 127 @@ -34,6 +35,7 @@ #else # include # define STRNLEN wcsnlen +# define MEMSET wmemset # define CHAR wchar_t # define BIG_CHAR WCHAR_MAX # define MIDDLE_CHAR 1121 @@ -87,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char) do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen)); } +static void +do_overflow_tests (void) +{ + size_t i, j, len; + const size_t one = 1; + uintptr_t buf_addr = (uintptr_t) buf1; + + for (i = 0; i < 750; ++i) + { + do_test (0, i, SIZE_MAX - i, BIG_CHAR); + do_test (0, i, i - buf_addr, BIG_CHAR); + do_test (0, i, -buf_addr - i, BIG_CHAR); + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR); + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR); + + len = 0; + for (j = 8 * sizeof(size_t) - 1; j ; --j) + { + len |= one << j; + do_test (0, i, len - i, BIG_CHAR); + do_test (0, i, len + i, BIG_CHAR); + do_test (0, i, len - buf_addr - i, BIG_CHAR); + do_test (0, i, len - buf_addr + i, BIG_CHAR); + + do_test (0, i, ~len - i, BIG_CHAR); + do_test (0, i, ~len + i, BIG_CHAR); + do_test (0, i, ~len - buf_addr - i, BIG_CHAR); + do_test (0, i, ~len - buf_addr + i, BIG_CHAR); + } + } +} + static void do_random_tests (void) { @@ -153,7 +187,7 @@ do_page_tests (void) size_t last_offset = (page_size / sizeof (CHAR)) - 1; CHAR *s = (CHAR *) buf2; - memset (s, 65, (last_offset - 1)); + MEMSET (s, 65, (last_offset - 1)); s[last_offset] = 0; /* Place short strings ending at page boundary. */ @@ -196,6 +230,35 @@ do_page_tests (void) } } +/* Tests meant to unveil fail on implementations that access bytes + beyond the maxium length. */ + +static void +do_page_2_tests (void) +{ + size_t i, exp_len, offset; + size_t last_offset = page_size / sizeof (CHAR); + + CHAR *s = (CHAR *) buf2; + MEMSET (s, 65, last_offset); + + /* Place short strings ending at page boundary without the null + byte. */ + offset = last_offset; + for (i = 0; i < 128; i++) + { + /* Decrease offset to stress several sizes and alignments. */ + offset--; + exp_len = last_offset - offset; + FOR_EACH_IMPL (impl, 0) + { + /* If an implementation goes beyond EXP_LEN, it will trigger + the segfault. */ + do_one_test (impl, (CHAR *) (s + offset), exp_len, exp_len); + } + } +} + int test_main (void) { @@ -242,6 +305,8 @@ test_main (void) do_random_tests (); do_page_tests (); + do_page_2_tests (); + do_overflow_tests (); return ret; } diff --git a/support/Makefile b/support/Makefile index 93faafddf9..3d3aff5ff9 100644 --- a/support/Makefile +++ b/support/Makefile @@ -35,6 +35,8 @@ libsupport-routines = \ ignore_stderr \ next_to_fault \ oom_error \ + resolv_response_context_duplicate \ + resolv_response_context_free \ resolv_test \ set_fortify_handler \ support-xfstat \ @@ -133,6 +135,7 @@ libsupport-routines = \ xpthread_join \ xpthread_key_create \ xpthread_key_delete \ + xpthread_kill \ xpthread_mutex_consistent \ xpthread_mutex_destroy \ xpthread_mutex_init \ diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h index 9808750f80..421f657678 100644 --- a/support/capture_subprocess.h +++ b/support/capture_subprocess.h @@ -41,6 +41,12 @@ struct support_capture_subprocess support_capture_subprocess struct support_capture_subprocess support_capture_subprogram (const char *file, char *const argv[]); +/* Copy the running program into a setgid binary and run it with CHILD_ID + argument. If execution is successful, return the exit status of the child + program, otherwise return a non-zero failure exit code. */ +int support_capture_subprogram_self_sgid + (char *child_id); + /* Deallocate the subprocess data captured by support_capture_subprocess. */ void support_capture_subprocess_free (struct support_capture_subprocess *); diff --git a/support/resolv_response_context_duplicate.c b/support/resolv_response_context_duplicate.c new file mode 100644 index 0000000000..f9c5c3462a --- /dev/null +++ b/support/resolv_response_context_duplicate.c @@ -0,0 +1,37 @@ +/* Duplicate a response context used in DNS resolver tests. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + +struct resolv_response_context * +resolv_response_context_duplicate (const struct resolv_response_context *ctx) +{ + struct resolv_response_context *result = xmalloc (sizeof (*result)); + memcpy (result, ctx, sizeof (*result)); + if (result->client_address != NULL) + { + result->client_address = xmalloc (result->client_address_length); + memcpy (result->client_address, ctx->client_address, + result->client_address_length); + } + result->query_buffer = xmalloc (result->query_length); + memcpy (result->query_buffer, ctx->query_buffer, result->query_length); + return result; +} diff --git a/support/resolv_response_context_free.c b/support/resolv_response_context_free.c new file mode 100644 index 0000000000..b88c05ffd4 --- /dev/null +++ b/support/resolv_response_context_free.c @@ -0,0 +1,28 @@ +/* Free a response context used in DNS resolver tests. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +void +resolv_response_context_free (struct resolv_response_context *ctx) +{ + free (ctx->query_buffer); + free (ctx->client_address); + free (ctx); +} diff --git a/support/resolv_test.c b/support/resolv_test.c index 53b7fc41ab..9878a040a3 100644 --- a/support/resolv_test.c +++ b/support/resolv_test.c @@ -181,7 +181,9 @@ resolv_response_init (struct resolv_response_builder *b, b->buffer[2] |= b->query_buffer[2] & 0x01; /* Copy the RD bit. */ if (flags.tc) b->buffer[2] |= 0x02; - b->buffer[3] = 0x80 | flags.rcode; /* Always set RA. */ + b->buffer[3] = flags.rcode; + if (!flags.clear_ra) + b->buffer[3] |= 0x80; if (flags.ad) b->buffer[3] |= 0x20; @@ -434,9 +436,9 @@ resolv_response_buffer (const struct resolv_response_builder *b) return result; } -static struct resolv_response_builder * -response_builder_allocate - (const unsigned char *query_buffer, size_t query_length) +struct resolv_response_builder * +resolv_response_builder_allocate (const unsigned char *query_buffer, + size_t query_length) { struct resolv_response_builder *b = xmalloc (sizeof (*b)); memset (b, 0, offsetof (struct resolv_response_builder, buffer)); @@ -445,8 +447,8 @@ response_builder_allocate return b; } -static void -response_builder_free (struct resolv_response_builder *b) +void +resolv_response_builder_free (struct resolv_response_builder *b) { tdestroy (b->compression_offsets, free); free (b); @@ -661,13 +663,17 @@ server_thread_udp_process_one (struct resolv_test *obj, int server_index) struct resolv_response_context ctx = { + .test = obj, + .client_address = &peer, + .client_address_length = peerlen, .query_buffer = query, .query_length = length, .server_index = server_index, .tcp = false, .edns = qinfo.edns, }; - struct resolv_response_builder *b = response_builder_allocate (query, length); + struct resolv_response_builder *b + = resolv_response_builder_allocate (query, length); obj->config.response_callback (&ctx, b, qinfo.qname, qinfo.qclass, qinfo.qtype); @@ -684,7 +690,7 @@ server_thread_udp_process_one (struct resolv_test *obj, int server_index) if (b->offset >= 12) printf ("info: UDP server %d: sending response:" " %zu bytes, RCODE %d (for %s/%u/%u)\n", - server_index, b->offset, b->buffer[3] & 0x0f, + ctx.server_index, b->offset, b->buffer[3] & 0x0f, qinfo.qname, qinfo.qclass, qinfo.qtype); else printf ("info: UDP server %d: sending response: %zu bytes" @@ -694,23 +700,31 @@ server_thread_udp_process_one (struct resolv_test *obj, int server_index) if (b->truncate_bytes > 0) printf ("info: truncated by %u bytes\n", b->truncate_bytes); } - size_t to_send = b->offset; - if (to_send < b->truncate_bytes) - to_send = 0; - else - to_send -= b->truncate_bytes; - - /* Ignore most errors here because the other end may have closed - the socket. */ - if (sendto (obj->servers[server_index].socket_udp, - b->buffer, to_send, 0, - (struct sockaddr *) &peer, peerlen) < 0) - TEST_VERIFY_EXIT (errno != EBADF); + resolv_response_send_udp (&ctx, b); } - response_builder_free (b); + resolv_response_builder_free (b); return true; } +void +resolv_response_send_udp (const struct resolv_response_context *ctx, + struct resolv_response_builder *b) +{ + TEST_VERIFY_EXIT (!ctx->tcp); + size_t to_send = b->offset; + if (to_send < b->truncate_bytes) + to_send = 0; + else + to_send -= b->truncate_bytes; + + /* Ignore most errors here because the other end may have closed + the socket. */ + if (sendto (ctx->test->servers[ctx->server_index].socket_udp, + b->buffer, to_send, 0, + ctx->client_address, ctx->client_address_length) < 0) + TEST_VERIFY_EXIT (errno != EBADF); +} + /* UDP thread_callback function. Variant for one thread per server. */ static void @@ -897,14 +911,15 @@ server_thread_tcp_client (void *arg) struct resolv_response_context ctx = { + .test = closure->obj, .query_buffer = query_buffer, .query_length = query_length, .server_index = closure->server_index, .tcp = true, .edns = qinfo.edns, }; - struct resolv_response_builder *b = response_builder_allocate - (query_buffer, query_length); + struct resolv_response_builder *b + = resolv_response_builder_allocate (query_buffer, query_length); closure->obj->config.response_callback (&ctx, b, qinfo.qname, qinfo.qclass, qinfo.qtype); @@ -936,7 +951,7 @@ server_thread_tcp_client (void *arg) writev_fully (closure->client_socket, buffers, 2); } bool close_flag = b->close; - response_builder_free (b); + resolv_response_builder_free (b); free (query_buffer); if (close_flag) break; diff --git a/support/resolv_test.h b/support/resolv_test.h index 67819469a0..31a5c1c3e7 100644 --- a/support/resolv_test.h +++ b/support/resolv_test.h @@ -35,25 +35,36 @@ struct resolv_edns_info uint16_t payload_size; }; +/* This opaque struct collects information about the resolver testing + currently in progress. */ +struct resolv_test; + /* This struct provides context information when the response callback specified in struct resolv_redirect_config is invoked. */ struct resolv_response_context { - const unsigned char *query_buffer; + struct resolv_test *test; + void *client_address; + size_t client_address_length; + unsigned char *query_buffer; size_t query_length; int server_index; bool tcp; struct resolv_edns_info edns; }; +/* Produces a deep copy of the context. */ +struct resolv_response_context * + resolv_response_context_duplicate (const struct resolv_response_context *); + +/* Frees the copy. For the context passed to the response function, + this happens implicitly. */ +void resolv_response_context_free (struct resolv_response_context *); + /* This opaque struct is used to construct responses from within the response callback function. */ struct resolv_response_builder; -/* This opaque struct collects information about the resolver testing - currently in progress. */ -struct resolv_test; - enum { /* Maximum number of test servers supported by the framework. */ @@ -137,6 +148,10 @@ struct resolv_response_flags /* If true, the AD (authenticated data) flag will be set. */ bool ad; + /* If true, do not set the RA (recursion available) flag in the + response. */ + bool clear_ra; + /* Initial section count values. Can be used to artificially increase the counts, for malformed packet testing.*/ unsigned short qdcount; @@ -188,6 +203,22 @@ void resolv_response_close (struct resolv_response_builder *); /* The size of the response packet built so far. */ size_t resolv_response_length (const struct resolv_response_builder *); +/* Allocates a response builder tied to a specific query packet, + starting at QUERY_BUFFER, containing QUERY_LENGTH bytes. */ +struct resolv_response_builder * + resolv_response_builder_allocate (const unsigned char *query_buffer, + size_t query_length); + +/* Deallocates a response buffer. */ +void resolv_response_builder_free (struct resolv_response_builder *); + +/* Sends a UDP response using a specific context. This can be used to + reorder or duplicate responses, along with + resolv_response_context_duplicate and + response_builder_allocate. */ +void resolv_response_send_udp (const struct resolv_response_context *, + struct resolv_response_builder *); + __END_DECLS #endif /* SUPPORT_RESOLV_TEST_H */ diff --git a/support/subprocess.h b/support/subprocess.h index 8b442fd5c0..34ffd02e8e 100644 --- a/support/subprocess.h +++ b/support/subprocess.h @@ -38,6 +38,11 @@ struct support_subprocess support_subprocess struct support_subprocess support_subprogram (const char *file, char *const argv[]); +/* Invoke program FILE with ARGV arguments by using posix_spawn and wait for it + to complete. Return program exit status. */ +int support_subprogram_wait + (const char *file, char *const argv[]); + /* Wait for the subprocess indicated by PROC::PID. Return the status indicate by waitpid call. */ int support_process_wait (struct support_subprocess *proc); diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c index eeed676e3d..28a37df67f 100644 --- a/support/support_capture_subprocess.c +++ b/support/support_capture_subprocess.c @@ -20,11 +20,14 @@ #include #include +#include #include #include #include #include #include +#include +#include static void transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream) @@ -36,7 +39,7 @@ transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream) if (ret < 0) { support_record_failure (); - printf ("error: reading from subprocess %s: %m", what); + printf ("error: reading from subprocess %s: %m\n", what); pfd->events = 0; pfd->revents = 0; } @@ -102,6 +105,129 @@ support_capture_subprogram (const char *file, char *const argv[]) return result; } +/* Copies the executable into a restricted directory, so that we can + safely make it SGID with the TARGET group ID. Then runs the + executable. */ +static int +copy_and_spawn_sgid (char *child_id, gid_t gid) +{ + char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", + test_dir, (intmax_t) getpid ()); + char *execname = xasprintf ("%s/bin", dirname); + int infd = -1; + int outfd = -1; + int ret = 1, status = 1; + + TEST_VERIFY (mkdir (dirname, 0700) == 0); + if (support_record_failure_is_failed ()) + goto err; + + infd = open ("/proc/self/exe", O_RDONLY); + if (infd < 0) + FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); + + outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); + TEST_VERIFY (outfd >= 0); + if (support_record_failure_is_failed ()) + goto err; + + char buf[4096]; + for (;;) + { + ssize_t rdcount = read (infd, buf, sizeof (buf)); + TEST_VERIFY (rdcount >= 0); + if (support_record_failure_is_failed ()) + goto err; + if (rdcount == 0) + break; + char *p = buf; + char *end = buf + rdcount; + while (p != end) + { + ssize_t wrcount = write (outfd, buf, end - p); + if (wrcount == 0) + errno = ENOSPC; + TEST_VERIFY (wrcount > 0); + if (support_record_failure_is_failed ()) + goto err; + p += wrcount; + } + } + TEST_VERIFY (fchown (outfd, getuid (), gid) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (fchmod (outfd, 02750) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (close (outfd) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (close (infd) == 0); + if (support_record_failure_is_failed ()) + goto err; + + /* We have the binary, now spawn the subprocess. Avoid using + support_subprogram because we only want the program exit status, not the + contents. */ + ret = 0; + + char * const args[] = {execname, child_id, NULL}; + + status = support_subprogram_wait (args[0], args); + +err: + if (outfd >= 0) + close (outfd); + if (infd >= 0) + close (infd); + if (execname != NULL) + { + unlink (execname); + free (execname); + } + if (dirname != NULL) + { + rmdir (dirname); + free (dirname); + } + + if (ret != 0) + FAIL_EXIT1("Failed to make sgid executable for test\n"); + + return status; +} + +int +support_capture_subprogram_self_sgid (char *child_id) +{ + gid_t target = 0; + const int count = 64; + gid_t groups[count]; + + /* Get a GID which is not our current GID, but is present in the + supplementary group list. */ + int ret = getgroups (count, groups); + if (ret < 0) + FAIL_UNSUPPORTED("Could not get group list for user %jd\n", + (intmax_t) getuid ()); + + gid_t current = getgid (); + for (int i = 0; i < ret; ++i) + { + if (groups[i] != current) + { + target = groups[i]; + break; + } + } + + if (target == 0) + FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", + (intmax_t) getuid ()); + + return copy_and_spawn_sgid (child_id, target); +} + void support_capture_subprocess_free (struct support_capture_subprocess *p) { diff --git a/support/support_subprocess.c b/support/support_subprocess.c index 36e3a77af2..4a25828111 100644 --- a/support/support_subprocess.c +++ b/support/support_subprocess.c @@ -27,7 +27,7 @@ #include static struct support_subprocess -support_suprocess_init (void) +support_subprocess_init (void) { struct support_subprocess result; @@ -48,7 +48,7 @@ support_suprocess_init (void) struct support_subprocess support_subprocess (void (*callback) (void *), void *closure) { - struct support_subprocess result = support_suprocess_init (); + struct support_subprocess result = support_subprocess_init (); result.pid = xfork (); if (result.pid == 0) @@ -71,7 +71,7 @@ support_subprocess (void (*callback) (void *), void *closure) struct support_subprocess support_subprogram (const char *file, char *const argv[]) { - struct support_subprocess result = support_suprocess_init (); + struct support_subprocess result = support_subprocess_init (); posix_spawn_file_actions_t fa; /* posix_spawn_file_actions_init does not fail. */ @@ -84,7 +84,7 @@ support_subprogram (const char *file, char *const argv[]) xposix_spawn_file_actions_addclose (&fa, result.stdout_pipe[1]); xposix_spawn_file_actions_addclose (&fa, result.stderr_pipe[1]); - result.pid = xposix_spawn (file, &fa, NULL, argv, NULL); + result.pid = xposix_spawn (file, &fa, NULL, argv, environ); xclose (result.stdout_pipe[1]); xclose (result.stderr_pipe[1]); @@ -92,6 +92,19 @@ support_subprogram (const char *file, char *const argv[]) return result; } +int +support_subprogram_wait (const char *file, char *const argv[]) +{ + posix_spawn_file_actions_t fa; + + posix_spawn_file_actions_init (&fa); + struct support_subprocess res = support_subprocess_init (); + + res.pid = xposix_spawn (file, &fa, NULL, argv, environ); + + return support_process_wait (&res); +} + int support_process_wait (struct support_subprocess *proc) { diff --git a/support/xpthread_kill.c b/support/xpthread_kill.c new file mode 100644 index 0000000000..111a75d85e --- /dev/null +++ b/support/xpthread_kill.c @@ -0,0 +1,26 @@ +/* pthread_kill with error checking. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +void +xpthread_kill (pthread_t thr, int signo) +{ + xpthread_check_return ("pthread_kill", pthread_kill (thr, signo)); +} diff --git a/support/xthread.h b/support/xthread.h index 05f8d4a7d9..cb1fc30da0 100644 --- a/support/xthread.h +++ b/support/xthread.h @@ -75,6 +75,8 @@ void xpthread_attr_setstacksize (pthread_attr_t *attr, void xpthread_attr_setguardsize (pthread_attr_t *attr, size_t guardsize); +void xpthread_kill (pthread_t thr, int signo); + /* Set the stack size in ATTR to a small value, but still large enough to cover most internal glibc stack usage. */ void support_set_small_thread_stack_size (pthread_attr_t *attr); diff --git a/sysdeps/aarch64/dl-bti.c b/sysdeps/aarch64/dl-bti.c index 196e462520..cf7624aaa2 100644 --- a/sysdeps/aarch64/dl-bti.c +++ b/sysdeps/aarch64/dl-bti.c @@ -19,43 +19,76 @@ #include #include #include +#include -static int -enable_bti (struct link_map *map, const char *program) +/* See elf/dl-load.h. */ +#ifndef MAP_COPY +# define MAP_COPY (MAP_PRIVATE | MAP_DENYWRITE) +#endif + +/* Enable BTI protection for MAP. */ + +void +_dl_bti_protect (struct link_map *map, int fd) { + const size_t pagesz = GLRO(dl_pagesize); const ElfW(Phdr) *phdr; - unsigned prot; for (phdr = map->l_phdr; phdr < &map->l_phdr[map->l_phnum]; ++phdr) if (phdr->p_type == PT_LOAD && (phdr->p_flags & PF_X)) { - void *start = (void *) (phdr->p_vaddr + map->l_addr); - size_t len = phdr->p_memsz; + size_t vstart = ALIGN_DOWN (phdr->p_vaddr, pagesz); + size_t vend = ALIGN_UP (phdr->p_vaddr + phdr->p_filesz, pagesz); + off_t off = ALIGN_DOWN (phdr->p_offset, pagesz); + void *start = (void *) (vstart + map->l_addr); + size_t len = vend - vstart; - prot = PROT_EXEC | PROT_BTI; + unsigned prot = PROT_EXEC | PROT_BTI; if (phdr->p_flags & PF_R) prot |= PROT_READ; if (phdr->p_flags & PF_W) prot |= PROT_WRITE; - if (__mprotect (start, len, prot) < 0) - { - if (program) - _dl_fatal_printf ("%s: mprotect failed to turn on BTI\n", - map->l_name); - else - _dl_signal_error (errno, map->l_name, "dlopen", - N_("mprotect failed to turn on BTI")); - } + if (fd == -1) + /* Ignore failures for kernel mapped binaries. */ + __mprotect (start, len, prot); + else + map->l_mach.bti_fail = __mmap (start, len, prot, + MAP_FIXED|MAP_COPY|MAP_FILE, + fd, off) == MAP_FAILED; } - return 0; } -/* Enable BTI for L if required. */ + +static void +bti_failed (struct link_map *l, const char *program) +{ + if (program) + _dl_fatal_printf ("%s: %s: failed to turn on BTI protection\n", + program, l->l_name); + else + /* Note: the errno value is not available any more. */ + _dl_signal_error (0, l->l_name, "dlopen", + N_("failed to turn on BTI protection")); +} + + +/* Enable BTI for L and its dependencies. */ void _dl_bti_check (struct link_map *l, const char *program) { - if (GLRO(dl_aarch64_cpu_features).bti && l->l_mach.bti) - enable_bti (l, program); + if (!GLRO(dl_aarch64_cpu_features).bti) + return; + + if (l->l_mach.bti_fail) + bti_failed (l, program); + + unsigned int i = l->l_searchlist.r_nlist; + while (i-- > 0) + { + struct link_map *dep = l->l_initfini[i]; + if (dep->l_mach.bti_fail) + bti_failed (dep, program); + } } diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h index 70b9ed3925..fde7cfd9e2 100644 --- a/sysdeps/aarch64/dl-machine.h +++ b/sysdeps/aarch64/dl-machine.h @@ -395,13 +395,6 @@ elf_machine_lazy_rel (struct link_map *map, /* Check for unexpected PLT reloc type. */ if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1)) { - if (map->l_mach.plt == 0) - { - /* Prelinking. */ - *reloc_addr += l_addr; - return; - } - if (__glibc_unlikely (map->l_info[DT_AARCH64 (VARIANT_PCS)] != NULL)) { /* Check the symbol table for variant PCS symbols. */ @@ -425,7 +418,10 @@ elf_machine_lazy_rel (struct link_map *map, } } - *reloc_addr = map->l_mach.plt; + if (map->l_mach.plt == 0) + *reloc_addr += l_addr; + else + *reloc_addr = map->l_mach.plt; } else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1)) { diff --git a/sysdeps/aarch64/dl-prop.h b/sysdeps/aarch64/dl-prop.h index b0785bda83..e926e54984 100644 --- a/sysdeps/aarch64/dl-prop.h +++ b/sysdeps/aarch64/dl-prop.h @@ -19,6 +19,8 @@ #ifndef _DL_PROP_H #define _DL_PROP_H +extern void _dl_bti_protect (struct link_map *, int) attribute_hidden; + extern void _dl_bti_check (struct link_map *, const char *) attribute_hidden; @@ -35,14 +37,18 @@ _dl_open_check (struct link_map *m) } static inline void __attribute__ ((always_inline)) -_dl_process_pt_note (struct link_map *l, const ElfW(Phdr) *ph) +_dl_process_pt_note (struct link_map *l, int fd, const ElfW(Phdr) *ph) { } static inline int -_dl_process_gnu_property (struct link_map *l, uint32_t type, uint32_t datasz, - void *data) +_dl_process_gnu_property (struct link_map *l, int fd, uint32_t type, + uint32_t datasz, void *data) { + if (!GLRO(dl_aarch64_cpu_features).bti) + /* Skip note processing. */ + return 0; + if (type == GNU_PROPERTY_AARCH64_FEATURE_1_AND) { /* Stop if the property note is ill-formed. */ @@ -51,7 +57,7 @@ _dl_process_gnu_property (struct link_map *l, uint32_t type, uint32_t datasz, unsigned int feature_1 = *(unsigned int *) data; if (feature_1 & GNU_PROPERTY_AARCH64_FEATURE_1_BTI) - l->l_mach.bti = true; + _dl_bti_protect (l, fd); /* Stop if we processed the property note. */ return 0; diff --git a/sysdeps/aarch64/linkmap.h b/sysdeps/aarch64/linkmap.h index 847a03ace2..b3f7663b07 100644 --- a/sysdeps/aarch64/linkmap.h +++ b/sysdeps/aarch64/linkmap.h @@ -22,5 +22,5 @@ struct link_map_machine { ElfW(Addr) plt; /* Address of .plt */ void *tlsdesc_table; /* Address of TLS descriptor hash table. */ - bool bti; /* Branch Target Identification is enabled. */ + bool bti_fail; /* Failed to enable Branch Target Identification. */ }; diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index 7cf5f033e8..799d60c98c 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -41,7 +41,8 @@ libc_ifunc (__libc_memcpy, ? __memcpy_falkor : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) ? __memcpy_thunderx2 - : (IS_NEOVERSE_N1 (midr) + : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr) + || IS_NEOVERSE_V1 (midr) ? __memcpy_simd : __memcpy_generic))))); diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S index d4ba747777..48bb6d7ca4 100644 --- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S +++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S @@ -223,12 +223,13 @@ L(copy_long_backwards): b.ls L(copy64_from_start) L(loop64_backwards): - stp A_q, B_q, [dstend, -32] + str B_q, [dstend, -16] + str A_q, [dstend, -32] ldp A_q, B_q, [srcend, -96] - stp C_q, D_q, [dstend, -64] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! ldp C_q, D_q, [srcend, -128] sub srcend, srcend, 64 - sub dstend, dstend, 64 subs count, count, 64 b.hi L(loop64_backwards) diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c index ad10aa8ac6..46a4cb3a54 100644 --- a/sysdeps/aarch64/multiarch/memmove.c +++ b/sysdeps/aarch64/multiarch/memmove.c @@ -41,7 +41,8 @@ libc_ifunc (__libc_memmove, ? __memmove_falkor : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) ? __memmove_thunderx2 - : (IS_NEOVERSE_N1 (midr) + : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr) + || IS_NEOVERSE_V1 (midr) ? __memmove_simd : __memmove_generic))))); diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S index 75393e1c18..1998ea95d4 100644 --- a/sysdeps/aarch64/start.S +++ b/sysdeps/aarch64/start.S @@ -43,11 +43,9 @@ */ .text - .globl _start - .type _start,#function -_start: - BTI_C +ENTRY(_start) /* Create an initial frame with 0 LR and FP */ + cfi_undefined (x30) mov x29, #0 mov x30, #0 @@ -101,8 +99,10 @@ _start: because crt1.o and rcrt1.o share code and the later must avoid the use of GOT relocations before __libc_start_main is called. */ __wrap_main: + BTI_C b main #endif +END(_start) /* Define a symbol for the first piece of initialized data. */ .data diff --git a/sysdeps/generic/dl-prop.h b/sysdeps/generic/dl-prop.h index f1cf576fe3..df27ff8e6a 100644 --- a/sysdeps/generic/dl-prop.h +++ b/sysdeps/generic/dl-prop.h @@ -37,15 +37,15 @@ _dl_open_check (struct link_map *m) } static inline void __attribute__ ((always_inline)) -_dl_process_pt_note (struct link_map *l, const ElfW(Phdr) *ph) +_dl_process_pt_note (struct link_map *l, int fd, const ElfW(Phdr) *ph) { } /* Called for each property in the NT_GNU_PROPERTY_TYPE_0 note of L, processing of the properties continues until this returns 0. */ static inline int __attribute__ ((always_inline)) -_dl_process_gnu_property (struct link_map *l, uint32_t type, uint32_t datasz, - void *data) +_dl_process_gnu_property (struct link_map *l, int fd, uint32_t type, + uint32_t datasz, void *data) { return 0; } diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index ba114ab4b1..62ac40d81b 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -919,8 +919,9 @@ extern void _dl_rtld_di_serinfo (struct link_map *loader, Dl_serinfo *si, bool counting); /* Process PT_GNU_PROPERTY program header PH in module L after - PT_LOAD segments are mapped. */ -void _dl_process_pt_gnu_property (struct link_map *l, const ElfW(Phdr) *ph); + PT_LOAD segments are mapped from file FD. */ +void _dl_process_pt_gnu_property (struct link_map *l, int fd, + const ElfW(Phdr) *ph); /* Search loaded objects' symbol tables for a definition of the symbol diff --git a/sysdeps/generic/unwind.h b/sysdeps/generic/unwind.h index b667a5b652..c229603af3 100644 --- a/sysdeps/generic/unwind.h +++ b/sysdeps/generic/unwind.h @@ -75,15 +75,21 @@ typedef void (*_Unwind_Exception_Cleanup_Fn) (_Unwind_Reason_Code, struct _Unwind_Exception { - _Unwind_Exception_Class exception_class; - _Unwind_Exception_Cleanup_Fn exception_cleanup; - _Unwind_Word private_1; - _Unwind_Word private_2; - - /* @@@ The IA-64 ABI says that this structure must be double-word aligned. - Taking that literally does not make much sense generically. Instead we - provide the maximum alignment required by any type for the machine. */ -} __attribute__((__aligned__)); + union + { + struct + { + _Unwind_Exception_Class exception_class; + _Unwind_Exception_Cleanup_Fn exception_cleanup; + _Unwind_Word private_1; + _Unwind_Word private_2; + }; + + /* The IA-64 ABI says that this structure must be double-word aligned. */ + _Unwind_Word unwind_exception_align[2] + __attribute__ ((__aligned__ (2 * sizeof (_Unwind_Word)))); + }; +}; /* The ACTIONS argument to the personality routine is a bitwise OR of one diff --git a/sysdeps/gnu/errlist.h b/sysdeps/gnu/errlist.h index 5d11ed723d..6329e5f393 100644 --- a/sysdeps/gnu/errlist.h +++ b/sysdeps/gnu/errlist.h @@ -1,24 +1,21 @@ -#ifndef ERR_MAP -#define ERR_MAP(value) value -#endif -_S(ERR_MAP(0), N_("Success")) +_S(0, N_("Success")) #ifdef EPERM /* TRANS Only the owner of the file (or other resource) TRANS or processes with special privileges can perform the operation. */ -_S(ERR_MAP(EPERM), N_("Operation not permitted")) +_S(EPERM, N_("Operation not permitted")) #endif #ifdef ENOENT /* TRANS This is a ``file doesn't exist'' error TRANS for ordinary files that are referenced in contexts where they are TRANS expected to already exist. */ -_S(ERR_MAP(ENOENT), N_("No such file or directory")) +_S(ENOENT, N_("No such file or directory")) #endif #ifdef ESRCH /* TRANS No process matches the specified process ID. */ -_S(ERR_MAP(ESRCH), N_("No such process")) +_S(ESRCH, N_("No such process")) #endif #ifdef EINTR /* @@ -29,12 +26,12 @@ TRANS TRANS You can choose to have functions resume after a signal that is handled, TRANS rather than failing with @code{EINTR}; see @ref{Interrupted TRANS Primitives}. */ -_S(ERR_MAP(EINTR), N_("Interrupted system call")) +_S(EINTR, N_("Interrupted system call")) #endif #ifdef EIO /* TRANS Usually used for physical read or write errors. */ -_S(ERR_MAP(EIO), N_("Input/output error")) +_S(EIO, N_("Input/output error")) #endif #ifdef ENXIO /* @@ -43,7 +40,7 @@ TRANS represented by a file you specified, and it couldn't find the device. TRANS This can mean that the device file was installed incorrectly, or that TRANS the physical device is missing or not correctly attached to the TRANS computer. */ -_S(ERR_MAP(ENXIO), N_("No such device or address")) +_S(ENXIO, N_("No such device or address")) #endif #ifdef E2BIG /* @@ -51,27 +48,27 @@ TRANS Used when the arguments passed to a new program TRANS being executed with one of the @code{exec} functions (@pxref{Executing a TRANS File}) occupy too much memory space. This condition never arises on TRANS @gnuhurdsystems{}. */ -_S(ERR_MAP(E2BIG), N_("Argument list too long")) +_S(E2BIG, N_("Argument list too long")) #endif #ifdef ENOEXEC /* TRANS Invalid executable file format. This condition is detected by the TRANS @code{exec} functions; see @ref{Executing a File}. */ -_S(ERR_MAP(ENOEXEC), N_("Exec format error")) +_S(ENOEXEC, N_("Exec format error")) #endif #ifdef EBADF /* TRANS For example, I/O on a descriptor that has been TRANS closed or reading from a descriptor open only for writing (or vice TRANS versa). */ -_S(ERR_MAP(EBADF), N_("Bad file descriptor")) +_S(EBADF, N_("Bad file descriptor")) #endif #ifdef ECHILD /* TRANS This error happens on operations that are TRANS supposed to manipulate child processes, when there aren't any processes TRANS to manipulate. */ -_S(ERR_MAP(ECHILD), N_("No child processes")) +_S(ECHILD, N_("No child processes")) #endif #ifdef EDEADLK /* @@ -79,74 +76,74 @@ TRANS Allocating a system resource would have resulted in a TRANS deadlock situation. The system does not guarantee that it will notice TRANS all such situations. This error means you got lucky and the system TRANS noticed; it might just hang. @xref{File Locks}, for an example. */ -_S(ERR_MAP(EDEADLK), N_("Resource deadlock avoided")) +_S(EDEADLK, N_("Resource deadlock avoided")) #endif #ifdef ENOMEM /* TRANS The system cannot allocate more virtual memory TRANS because its capacity is full. */ -_S(ERR_MAP(ENOMEM), N_("Cannot allocate memory")) +_S(ENOMEM, N_("Cannot allocate memory")) #endif #ifdef EACCES /* TRANS The file permissions do not allow the attempted operation. */ -_S(ERR_MAP(EACCES), N_("Permission denied")) +_S(EACCES, N_("Permission denied")) #endif #ifdef EFAULT /* TRANS An invalid pointer was detected. TRANS On @gnuhurdsystems{}, this error never happens; you get a signal instead. */ -_S(ERR_MAP(EFAULT), N_("Bad address")) +_S(EFAULT, N_("Bad address")) #endif #ifdef ENOTBLK /* TRANS A file that isn't a block special file was given in a situation that TRANS requires one. For example, trying to mount an ordinary file as a file TRANS system in Unix gives this error. */ -_S(ERR_MAP(ENOTBLK), N_("Block device required")) +_S(ENOTBLK, N_("Block device required")) #endif #ifdef EBUSY /* TRANS A system resource that can't be shared is already in use. TRANS For example, if you try to delete a file that is the root of a currently TRANS mounted filesystem, you get this error. */ -_S(ERR_MAP(EBUSY), N_("Device or resource busy")) +_S(EBUSY, N_("Device or resource busy")) #endif #ifdef EEXIST /* TRANS An existing file was specified in a context where it only TRANS makes sense to specify a new file. */ -_S(ERR_MAP(EEXIST), N_("File exists")) +_S(EEXIST, N_("File exists")) #endif #ifdef EXDEV /* TRANS An attempt to make an improper link across file systems was detected. TRANS This happens not only when you use @code{link} (@pxref{Hard Links}) but TRANS also when you rename a file with @code{rename} (@pxref{Renaming Files}). */ -_S(ERR_MAP(EXDEV), N_("Invalid cross-device link")) +_S(EXDEV, N_("Invalid cross-device link")) #endif #ifdef ENODEV /* TRANS The wrong type of device was given to a function that expects a TRANS particular sort of device. */ -_S(ERR_MAP(ENODEV), N_("No such device")) +_S(ENODEV, N_("No such device")) #endif #ifdef ENOTDIR /* TRANS A file that isn't a directory was specified when a directory is required. */ -_S(ERR_MAP(ENOTDIR), N_("Not a directory")) +_S(ENOTDIR, N_("Not a directory")) #endif #ifdef EISDIR /* TRANS You cannot open a directory for writing, TRANS or create or remove hard links to it. */ -_S(ERR_MAP(EISDIR), N_("Is a directory")) +_S(EISDIR, N_("Is a directory")) #endif #ifdef EINVAL /* TRANS This is used to indicate various kinds of problems TRANS with passing the wrong argument to a library function. */ -_S(ERR_MAP(EINVAL), N_("Invalid argument")) +_S(EINVAL, N_("Invalid argument")) #endif #ifdef EMFILE /* @@ -157,20 +154,20 @@ TRANS In BSD and GNU, the number of open files is controlled by a resource TRANS limit that can usually be increased. If you get this error, you might TRANS want to increase the @code{RLIMIT_NOFILE} limit or make it unlimited; TRANS @pxref{Limits on Resources}. */ -_S(ERR_MAP(EMFILE), N_("Too many open files")) +_S(EMFILE, N_("Too many open files")) #endif #ifdef ENFILE /* TRANS There are too many distinct file openings in the entire system. Note TRANS that any number of linked channels count as just one file opening; see TRANS @ref{Linked Channels}. This error never occurs on @gnuhurdsystems{}. */ -_S(ERR_MAP(ENFILE), N_("Too many open files in system")) +_S(ENFILE, N_("Too many open files in system")) #endif #ifdef ENOTTY /* TRANS Inappropriate I/O control operation, such as trying to set terminal TRANS modes on an ordinary file. */ -_S(ERR_MAP(ENOTTY), N_("Inappropriate ioctl for device")) +_S(ENOTTY, N_("Inappropriate ioctl for device")) #endif #ifdef ETXTBSY /* @@ -179,35 +176,35 @@ TRANS write to a file that is currently being executed. Often using a TRANS debugger to run a program is considered having it open for writing and TRANS will cause this error. (The name stands for ``text file busy''.) This TRANS is not an error on @gnuhurdsystems{}; the text is copied as necessary. */ -_S(ERR_MAP(ETXTBSY), N_("Text file busy")) +_S(ETXTBSY, N_("Text file busy")) #endif #ifdef EFBIG /* TRANS The size of a file would be larger than allowed by the system. */ -_S(ERR_MAP(EFBIG), N_("File too large")) +_S(EFBIG, N_("File too large")) #endif #ifdef ENOSPC /* TRANS Write operation on a file failed because the TRANS disk is full. */ -_S(ERR_MAP(ENOSPC), N_("No space left on device")) +_S(ENOSPC, N_("No space left on device")) #endif #ifdef ESPIPE /* TRANS Invalid seek operation (such as on a pipe). */ -_S(ERR_MAP(ESPIPE), N_("Illegal seek")) +_S(ESPIPE, N_("Illegal seek")) #endif #ifdef EROFS /* TRANS An attempt was made to modify something on a read-only file system. */ -_S(ERR_MAP(EROFS), N_("Read-only file system")) +_S(EROFS, N_("Read-only file system")) #endif #ifdef EMLINK /* TRANS The link count of a single file would become too large. TRANS @code{rename} can cause this error if the file being renamed already has TRANS as many links as it can take (@pxref{Renaming Files}). */ -_S(ERR_MAP(EMLINK), N_("Too many links")) +_S(EMLINK, N_("Too many links")) #endif #ifdef EPIPE /* @@ -216,19 +213,19 @@ TRANS Every library function that returns this error code also generates a TRANS @code{SIGPIPE} signal; this signal terminates the program if not handled TRANS or blocked. Thus, your program will never actually see @code{EPIPE} TRANS unless it has handled or blocked @code{SIGPIPE}. */ -_S(ERR_MAP(EPIPE), N_("Broken pipe")) +_S(EPIPE, N_("Broken pipe")) #endif #ifdef EDOM /* TRANS Used by mathematical functions when an argument value does TRANS not fall into the domain over which the function is defined. */ -_S(ERR_MAP(EDOM), N_("Numerical argument out of domain")) +_S(EDOM, N_("Numerical argument out of domain")) #endif #ifdef ERANGE /* TRANS Used by mathematical functions when the result value is TRANS not representable because of overflow or underflow. */ -_S(ERR_MAP(ERANGE), N_("Numerical result out of range")) +_S(ERANGE, N_("Numerical result out of range")) #endif #ifdef EAGAIN /* @@ -261,7 +258,7 @@ TRANS Such shortages are usually fairly serious and affect the whole system, TRANS so usually an interactive program should report the error to the user TRANS and return to its command loop. TRANS @end itemize */ -_S(ERR_MAP(EAGAIN), N_("Resource temporarily unavailable")) +_S(EAGAIN, N_("Resource temporarily unavailable")) #endif #ifdef EINPROGRESS /* @@ -273,47 +270,47 @@ TRANS the operation has begun and will take some time. Attempts to manipulate TRANS the object before the call completes return @code{EALREADY}. You can TRANS use the @code{select} function to find out when the pending operation TRANS has completed; @pxref{Waiting for I/O}. */ -_S(ERR_MAP(EINPROGRESS), N_("Operation now in progress")) +_S(EINPROGRESS, N_("Operation now in progress")) #endif #ifdef EALREADY /* TRANS An operation is already in progress on an object that has non-blocking TRANS mode selected. */ -_S(ERR_MAP(EALREADY), N_("Operation already in progress")) +_S(EALREADY, N_("Operation already in progress")) #endif #ifdef ENOTSOCK /* TRANS A file that isn't a socket was specified when a socket is required. */ -_S(ERR_MAP(ENOTSOCK), N_("Socket operation on non-socket")) +_S(ENOTSOCK, N_("Socket operation on non-socket")) #endif #ifdef EMSGSIZE /* TRANS The size of a message sent on a socket was larger than the supported TRANS maximum size. */ -_S(ERR_MAP(EMSGSIZE), N_("Message too long")) +_S(EMSGSIZE, N_("Message too long")) #endif #ifdef EPROTOTYPE /* TRANS The socket type does not support the requested communications protocol. */ -_S(ERR_MAP(EPROTOTYPE), N_("Protocol wrong type for socket")) +_S(EPROTOTYPE, N_("Protocol wrong type for socket")) #endif #ifdef ENOPROTOOPT /* TRANS You specified a socket option that doesn't make sense for the TRANS particular protocol being used by the socket. @xref{Socket Options}. */ -_S(ERR_MAP(ENOPROTOOPT), N_("Protocol not available")) +_S(ENOPROTOOPT, N_("Protocol not available")) #endif #ifdef EPROTONOSUPPORT /* TRANS The socket domain does not support the requested communications protocol TRANS (perhaps because the requested protocol is completely invalid). TRANS @xref{Creating a Socket}. */ -_S(ERR_MAP(EPROTONOSUPPORT), N_("Protocol not supported")) +_S(EPROTONOSUPPORT, N_("Protocol not supported")) #endif #ifdef ESOCKTNOSUPPORT /* TRANS The socket type is not supported. */ -_S(ERR_MAP(ESOCKTNOSUPPORT), N_("Socket type not supported")) +_S(ESOCKTNOSUPPORT, N_("Socket type not supported")) #endif #ifdef EOPNOTSUPP /* @@ -323,71 +320,71 @@ TRANS implemented for all communications protocols. On @gnuhurdsystems{}, this TRANS error can happen for many calls when the object does not support the TRANS particular operation; it is a generic indication that the server knows TRANS nothing to do for that call. */ -_S(ERR_MAP(EOPNOTSUPP), N_("Operation not supported")) +_S(EOPNOTSUPP, N_("Operation not supported")) #endif #ifdef EPFNOSUPPORT /* TRANS The socket communications protocol family you requested is not supported. */ -_S(ERR_MAP(EPFNOSUPPORT), N_("Protocol family not supported")) +_S(EPFNOSUPPORT, N_("Protocol family not supported")) #endif #ifdef EAFNOSUPPORT /* TRANS The address family specified for a socket is not supported; it is TRANS inconsistent with the protocol being used on the socket. @xref{Sockets}. */ -_S(ERR_MAP(EAFNOSUPPORT), N_("Address family not supported by protocol")) +_S(EAFNOSUPPORT, N_("Address family not supported by protocol")) #endif #ifdef EADDRINUSE /* TRANS The requested socket address is already in use. @xref{Socket Addresses}. */ -_S(ERR_MAP(EADDRINUSE), N_("Address already in use")) +_S(EADDRINUSE, N_("Address already in use")) #endif #ifdef EADDRNOTAVAIL /* TRANS The requested socket address is not available; for example, you tried TRANS to give a socket a name that doesn't match the local host name. TRANS @xref{Socket Addresses}. */ -_S(ERR_MAP(EADDRNOTAVAIL), N_("Cannot assign requested address")) +_S(EADDRNOTAVAIL, N_("Cannot assign requested address")) #endif #ifdef ENETDOWN /* TRANS A socket operation failed because the network was down. */ -_S(ERR_MAP(ENETDOWN), N_("Network is down")) +_S(ENETDOWN, N_("Network is down")) #endif #ifdef ENETUNREACH /* TRANS A socket operation failed because the subnet containing the remote host TRANS was unreachable. */ -_S(ERR_MAP(ENETUNREACH), N_("Network is unreachable")) +_S(ENETUNREACH, N_("Network is unreachable")) #endif #ifdef ENETRESET /* TRANS A network connection was reset because the remote host crashed. */ -_S(ERR_MAP(ENETRESET), N_("Network dropped connection on reset")) +_S(ENETRESET, N_("Network dropped connection on reset")) #endif #ifdef ECONNABORTED /* TRANS A network connection was aborted locally. */ -_S(ERR_MAP(ECONNABORTED), N_("Software caused connection abort")) +_S(ECONNABORTED, N_("Software caused connection abort")) #endif #ifdef ECONNRESET /* TRANS A network connection was closed for reasons outside the control of the TRANS local host, such as by the remote machine rebooting or an unrecoverable TRANS protocol violation. */ -_S(ERR_MAP(ECONNRESET), N_("Connection reset by peer")) +_S(ECONNRESET, N_("Connection reset by peer")) #endif #ifdef ENOBUFS /* TRANS The kernel's buffers for I/O operations are all in use. In GNU, this TRANS error is always synonymous with @code{ENOMEM}; you may get one or the TRANS other from network operations. */ -_S(ERR_MAP(ENOBUFS), N_("No buffer space available")) +_S(ENOBUFS, N_("No buffer space available")) #endif #ifdef EISCONN /* TRANS You tried to connect a socket that is already connected. TRANS @xref{Connecting}. */ -_S(ERR_MAP(EISCONN), N_("Transport endpoint is already connected")) +_S(EISCONN, N_("Transport endpoint is already connected")) #endif #ifdef ENOTCONN /* @@ -395,74 +392,74 @@ TRANS The socket is not connected to anything. You get this error when you TRANS try to transmit data over a socket, without first specifying a TRANS destination for the data. For a connectionless socket (for datagram TRANS protocols, such as UDP), you get @code{EDESTADDRREQ} instead. */ -_S(ERR_MAP(ENOTCONN), N_("Transport endpoint is not connected")) +_S(ENOTCONN, N_("Transport endpoint is not connected")) #endif #ifdef EDESTADDRREQ /* TRANS No default destination address was set for the socket. You get this TRANS error when you try to transmit data over a connectionless socket, TRANS without first specifying a destination for the data with @code{connect}. */ -_S(ERR_MAP(EDESTADDRREQ), N_("Destination address required")) +_S(EDESTADDRREQ, N_("Destination address required")) #endif #ifdef ESHUTDOWN /* TRANS The socket has already been shut down. */ -_S(ERR_MAP(ESHUTDOWN), N_("Cannot send after transport endpoint shutdown")) +_S(ESHUTDOWN, N_("Cannot send after transport endpoint shutdown")) #endif #ifdef ETOOMANYREFS -_S(ERR_MAP(ETOOMANYREFS), N_("Too many references: cannot splice")) +_S(ETOOMANYREFS, N_("Too many references: cannot splice")) #endif #ifdef ETIMEDOUT /* TRANS A socket operation with a specified timeout received no response during TRANS the timeout period. */ -_S(ERR_MAP(ETIMEDOUT), N_("Connection timed out")) +_S(ETIMEDOUT, N_("Connection timed out")) #endif #ifdef ECONNREFUSED /* TRANS A remote host refused to allow the network connection (typically because TRANS it is not running the requested service). */ -_S(ERR_MAP(ECONNREFUSED), N_("Connection refused")) +_S(ECONNREFUSED, N_("Connection refused")) #endif #ifdef ELOOP /* TRANS Too many levels of symbolic links were encountered in looking up a file name. TRANS This often indicates a cycle of symbolic links. */ -_S(ERR_MAP(ELOOP), N_("Too many levels of symbolic links")) +_S(ELOOP, N_("Too many levels of symbolic links")) #endif #ifdef ENAMETOOLONG /* TRANS Filename too long (longer than @code{PATH_MAX}; @pxref{Limits for TRANS Files}) or host name too long (in @code{gethostname} or TRANS @code{sethostname}; @pxref{Host Identification}). */ -_S(ERR_MAP(ENAMETOOLONG), N_("File name too long")) +_S(ENAMETOOLONG, N_("File name too long")) #endif #ifdef EHOSTDOWN /* TRANS The remote host for a requested network connection is down. */ -_S(ERR_MAP(EHOSTDOWN), N_("Host is down")) +_S(EHOSTDOWN, N_("Host is down")) #endif /* TRANS The remote host for a requested network connection is not reachable. */ #ifdef EHOSTUNREACH -_S(ERR_MAP(EHOSTUNREACH), N_("No route to host")) +_S(EHOSTUNREACH, N_("No route to host")) #endif #ifdef ENOTEMPTY /* TRANS Directory not empty, where an empty directory was expected. Typically, TRANS this error occurs when you are trying to delete a directory. */ -_S(ERR_MAP(ENOTEMPTY), N_("Directory not empty")) +_S(ENOTEMPTY, N_("Directory not empty")) #endif #ifdef EUSERS /* TRANS The file quota system is confused because there are too many users. TRANS @c This can probably happen in a GNU system when using NFS. */ -_S(ERR_MAP(EUSERS), N_("Too many users")) +_S(EUSERS, N_("Too many users")) #endif #ifdef EDQUOT /* TRANS The user's disk quota was exceeded. */ -_S(ERR_MAP(EDQUOT), N_("Disk quota exceeded")) +_S(EDQUOT, N_("Disk quota exceeded")) #endif #ifdef ESTALE /* @@ -471,7 +468,7 @@ TRANS file system which is due to file system rearrangements on the server host TRANS for NFS file systems or corruption in other file systems. TRANS Repairing this condition usually requires unmounting, possibly repairing TRANS and remounting the file system. */ -_S(ERR_MAP(ESTALE), N_("Stale file handle")) +_S(ESTALE, N_("Stale file handle")) #endif #ifdef EREMOTE /* @@ -479,7 +476,7 @@ TRANS An attempt was made to NFS-mount a remote file system with a file name tha TRANS already specifies an NFS-mounted file. TRANS (This is an error on some operating systems, but we expect it to work TRANS properly on @gnuhurdsystems{}, making this error code impossible.) */ -_S(ERR_MAP(EREMOTE), N_("Object is remote")) +_S(EREMOTE, N_("Object is remote")) #endif #ifdef ENOLCK /* @@ -487,7 +484,7 @@ TRANS This is used by the file locking facilities; see TRANS @ref{File Locks}. This error is never generated by @gnuhurdsystems{}, but TRANS it can result from an operation to an NFS server running another TRANS operating system. */ -_S(ERR_MAP(ENOLCK), N_("No locks available")) +_S(ENOLCK, N_("No locks available")) #endif #ifdef ENOSYS /* @@ -496,46 +493,46 @@ TRANS not implemented at all, either in the C library itself or in the TRANS operating system. When you get this error, you can be sure that this TRANS particular function will always fail with @code{ENOSYS} unless you TRANS install a new version of the C library or the operating system. */ -_S(ERR_MAP(ENOSYS), N_("Function not implemented")) +_S(ENOSYS, N_("Function not implemented")) #endif #ifdef EILSEQ /* TRANS While decoding a multibyte character the function came along an invalid TRANS or an incomplete sequence of bytes or the given wide character is invalid. */ -_S(ERR_MAP(EILSEQ), N_("Invalid or incomplete multibyte or wide character")) +_S(EILSEQ, N_("Invalid or incomplete multibyte or wide character")) #endif #ifdef EBADMSG -_S(ERR_MAP(EBADMSG), N_("Bad message")) +_S(EBADMSG, N_("Bad message")) #endif #ifdef EIDRM -_S(ERR_MAP(EIDRM), N_("Identifier removed")) +_S(EIDRM, N_("Identifier removed")) #endif #ifdef EMULTIHOP -_S(ERR_MAP(EMULTIHOP), N_("Multihop attempted")) +_S(EMULTIHOP, N_("Multihop attempted")) #endif #ifdef ENODATA -_S(ERR_MAP(ENODATA), N_("No data available")) +_S(ENODATA, N_("No data available")) #endif #ifdef ENOLINK -_S(ERR_MAP(ENOLINK), N_("Link has been severed")) +_S(ENOLINK, N_("Link has been severed")) #endif #ifdef ENOMSG -_S(ERR_MAP(ENOMSG), N_("No message of desired type")) +_S(ENOMSG, N_("No message of desired type")) #endif #ifdef ENOSR -_S(ERR_MAP(ENOSR), N_("Out of streams resources")) +_S(ENOSR, N_("Out of streams resources")) #endif #ifdef ENOSTR -_S(ERR_MAP(ENOSTR), N_("Device not a stream")) +_S(ENOSTR, N_("Device not a stream")) #endif #ifdef EOVERFLOW -_S(ERR_MAP(EOVERFLOW), N_("Value too large for defined data type")) +_S(EOVERFLOW, N_("Value too large for defined data type")) #endif #ifdef EPROTO -_S(ERR_MAP(EPROTO), N_("Protocol error")) +_S(EPROTO, N_("Protocol error")) #endif #ifdef ETIME -_S(ERR_MAP(ETIME), N_("Timer expired")) +_S(ETIME, N_("Timer expired")) #endif #ifdef ECANCELED /* @@ -543,148 +540,148 @@ TRANS An asynchronous operation was canceled before it TRANS completed. @xref{Asynchronous I/O}. When you call @code{aio_cancel}, TRANS the normal result is for the operations affected to complete with this TRANS error; @pxref{Cancel AIO Operations}. */ -_S(ERR_MAP(ECANCELED), N_("Operation canceled")) +_S(ECANCELED, N_("Operation canceled")) #endif #ifdef EOWNERDEAD -_S(ERR_MAP(EOWNERDEAD), N_("Owner died")) +_S(EOWNERDEAD, N_("Owner died")) #endif #ifdef ENOTRECOVERABLE -_S(ERR_MAP(ENOTRECOVERABLE), N_("State not recoverable")) +_S(ENOTRECOVERABLE, N_("State not recoverable")) #endif #ifdef ERESTART -_S(ERR_MAP(ERESTART), N_("Interrupted system call should be restarted")) +_S(ERESTART, N_("Interrupted system call should be restarted")) #endif #ifdef ECHRNG -_S(ERR_MAP(ECHRNG), N_("Channel number out of range")) +_S(ECHRNG, N_("Channel number out of range")) #endif #ifdef EL2NSYNC -_S(ERR_MAP(EL2NSYNC), N_("Level 2 not synchronized")) +_S(EL2NSYNC, N_("Level 2 not synchronized")) #endif #ifdef EL3HLT -_S(ERR_MAP(EL3HLT), N_("Level 3 halted")) +_S(EL3HLT, N_("Level 3 halted")) #endif #ifdef EL3RST -_S(ERR_MAP(EL3RST), N_("Level 3 reset")) +_S(EL3RST, N_("Level 3 reset")) #endif #ifdef ELNRNG -_S(ERR_MAP(ELNRNG), N_("Link number out of range")) +_S(ELNRNG, N_("Link number out of range")) #endif #ifdef EUNATCH -_S(ERR_MAP(EUNATCH), N_("Protocol driver not attached")) +_S(EUNATCH, N_("Protocol driver not attached")) #endif #ifdef ENOCSI -_S(ERR_MAP(ENOCSI), N_("No CSI structure available")) +_S(ENOCSI, N_("No CSI structure available")) #endif #ifdef EL2HLT -_S(ERR_MAP(EL2HLT), N_("Level 2 halted")) +_S(EL2HLT, N_("Level 2 halted")) #endif #ifdef EBADE -_S(ERR_MAP(EBADE), N_("Invalid exchange")) +_S(EBADE, N_("Invalid exchange")) #endif #ifdef EBADR -_S(ERR_MAP(EBADR), N_("Invalid request descriptor")) +_S(EBADR, N_("Invalid request descriptor")) #endif #ifdef EXFULL -_S(ERR_MAP(EXFULL), N_("Exchange full")) +_S(EXFULL, N_("Exchange full")) #endif #ifdef ENOANO -_S(ERR_MAP(ENOANO), N_("No anode")) +_S(ENOANO, N_("No anode")) #endif #ifdef EBADRQC -_S(ERR_MAP(EBADRQC), N_("Invalid request code")) +_S(EBADRQC, N_("Invalid request code")) #endif #ifdef EBADSLT -_S(ERR_MAP(EBADSLT), N_("Invalid slot")) +_S(EBADSLT, N_("Invalid slot")) #endif #ifdef EBFONT -_S(ERR_MAP(EBFONT), N_("Bad font file format")) +_S(EBFONT, N_("Bad font file format")) #endif #ifdef ENONET -_S(ERR_MAP(ENONET), N_("Machine is not on the network")) +_S(ENONET, N_("Machine is not on the network")) #endif #ifdef ENOPKG -_S(ERR_MAP(ENOPKG), N_("Package not installed")) +_S(ENOPKG, N_("Package not installed")) #endif #ifdef EADV -_S(ERR_MAP(EADV), N_("Advertise error")) +_S(EADV, N_("Advertise error")) #endif #ifdef ESRMNT -_S(ERR_MAP(ESRMNT), N_("Srmount error")) +_S(ESRMNT, N_("Srmount error")) #endif #ifdef ECOMM -_S(ERR_MAP(ECOMM), N_("Communication error on send")) +_S(ECOMM, N_("Communication error on send")) #endif #ifdef EDOTDOT -_S(ERR_MAP(EDOTDOT), N_("RFS specific error")) +_S(EDOTDOT, N_("RFS specific error")) #endif #ifdef ENOTUNIQ -_S(ERR_MAP(ENOTUNIQ), N_("Name not unique on network")) +_S(ENOTUNIQ, N_("Name not unique on network")) #endif #ifdef EBADFD -_S(ERR_MAP(EBADFD), N_("File descriptor in bad state")) +_S(EBADFD, N_("File descriptor in bad state")) #endif #ifdef EREMCHG -_S(ERR_MAP(EREMCHG), N_("Remote address changed")) +_S(EREMCHG, N_("Remote address changed")) #endif #ifdef ELIBACC -_S(ERR_MAP(ELIBACC), N_("Can not access a needed shared library")) +_S(ELIBACC, N_("Can not access a needed shared library")) #endif #ifdef ELIBBAD -_S(ERR_MAP(ELIBBAD), N_("Accessing a corrupted shared library")) +_S(ELIBBAD, N_("Accessing a corrupted shared library")) #endif #ifdef ELIBSCN -_S(ERR_MAP(ELIBSCN), N_(".lib section in a.out corrupted")) +_S(ELIBSCN, N_(".lib section in a.out corrupted")) #endif #ifdef ELIBMAX -_S(ERR_MAP(ELIBMAX), N_("Attempting to link in too many shared libraries")) +_S(ELIBMAX, N_("Attempting to link in too many shared libraries")) #endif #ifdef ELIBEXEC -_S(ERR_MAP(ELIBEXEC), N_("Cannot exec a shared library directly")) +_S(ELIBEXEC, N_("Cannot exec a shared library directly")) #endif #ifdef ESTRPIPE -_S(ERR_MAP(ESTRPIPE), N_("Streams pipe error")) +_S(ESTRPIPE, N_("Streams pipe error")) #endif #ifdef EUCLEAN -_S(ERR_MAP(EUCLEAN), N_("Structure needs cleaning")) +_S(EUCLEAN, N_("Structure needs cleaning")) #endif #ifdef ENOTNAM -_S(ERR_MAP(ENOTNAM), N_("Not a XENIX named type file")) +_S(ENOTNAM, N_("Not a XENIX named type file")) #endif #ifdef ENAVAIL -_S(ERR_MAP(ENAVAIL), N_("No XENIX semaphores available")) +_S(ENAVAIL, N_("No XENIX semaphores available")) #endif #ifdef EISNAM -_S(ERR_MAP(EISNAM), N_("Is a named type file")) +_S(EISNAM, N_("Is a named type file")) #endif #ifdef EREMOTEIO -_S(ERR_MAP(EREMOTEIO), N_("Remote I/O error")) +_S(EREMOTEIO, N_("Remote I/O error")) #endif #ifdef ENOMEDIUM -_S(ERR_MAP(ENOMEDIUM), N_("No medium found")) +_S(ENOMEDIUM, N_("No medium found")) #endif #ifdef EMEDIUMTYPE -_S(ERR_MAP(EMEDIUMTYPE), N_("Wrong medium type")) +_S(EMEDIUMTYPE, N_("Wrong medium type")) #endif #ifdef ENOKEY -_S(ERR_MAP(ENOKEY), N_("Required key not available")) +_S(ENOKEY, N_("Required key not available")) #endif #ifdef EKEYEXPIRED -_S(ERR_MAP(EKEYEXPIRED), N_("Key has expired")) +_S(EKEYEXPIRED, N_("Key has expired")) #endif #ifdef EKEYREVOKED -_S(ERR_MAP(EKEYREVOKED), N_("Key has been revoked")) +_S(EKEYREVOKED, N_("Key has been revoked")) #endif #ifdef EKEYREJECTED -_S(ERR_MAP(EKEYREJECTED), N_("Key was rejected by service")) +_S(EKEYREJECTED, N_("Key was rejected by service")) #endif #ifdef ERFKILL -_S(ERR_MAP(ERFKILL), N_("Operation not possible due to RF-kill")) +_S(ERFKILL, N_("Operation not possible due to RF-kill")) #endif #ifdef EHWPOISON -_S(ERR_MAP(EHWPOISON), N_("Memory page has hardware error")) +_S(EHWPOISON, N_("Memory page has hardware error")) #endif #ifdef EBADRPC -_S(ERR_MAP(EBADRPC), N_("RPC struct is bad")) +_S(EBADRPC, N_("RPC struct is bad")) #endif #ifdef EFTYPE /* @@ -693,40 +690,40 @@ TRANS operation, or a data file had the wrong format. TRANS TRANS On some systems @code{chmod} returns this error if you try to set the TRANS sticky bit on a non-directory file; @pxref{Setting Permissions}. */ -_S(ERR_MAP(EFTYPE), N_("Inappropriate file type or format")) +_S(EFTYPE, N_("Inappropriate file type or format")) #endif #ifdef EPROCUNAVAIL -_S(ERR_MAP(EPROCUNAVAIL), N_("RPC bad procedure for program")) +_S(EPROCUNAVAIL, N_("RPC bad procedure for program")) #endif #ifdef EAUTH -_S(ERR_MAP(EAUTH), N_("Authentication error")) +_S(EAUTH, N_("Authentication error")) #endif #ifdef EDIED /* TRANS On @gnuhurdsystems{}, opening a file returns this error when the file is TRANS translated by a program and the translator program dies while starting TRANS up, before it has connected to the file. */ -_S(ERR_MAP(EDIED), N_("Translator died")) +_S(EDIED, N_("Translator died")) #endif #ifdef ERPCMISMATCH -_S(ERR_MAP(ERPCMISMATCH), N_("RPC version wrong")) +_S(ERPCMISMATCH, N_("RPC version wrong")) #endif #ifdef EGREGIOUS /* TRANS You did @strong{what}? */ -_S(ERR_MAP(EGREGIOUS), N_("You really blew it this time")) +_S(EGREGIOUS, N_("You really blew it this time")) #endif #ifdef EPROCLIM /* TRANS This means that the per-user limit on new process would be exceeded by TRANS an attempted @code{fork}. @xref{Limits on Resources}, for details on TRANS the @code{RLIMIT_NPROC} limit. */ -_S(ERR_MAP(EPROCLIM), N_("Too many processes")) +_S(EPROCLIM, N_("Too many processes")) #endif #ifdef EGRATUITOUS /* TRANS This error code has no purpose. */ -_S(ERR_MAP(EGRATUITOUS), N_("Gratuitous error")) +_S(EGRATUITOUS, N_("Gratuitous error")) #endif #if defined (ENOTSUP) && ENOTSUP != EOPNOTSUPP /* @@ -742,10 +739,10 @@ TRANS values. TRANS TRANS If the entire function is not available at all in the implementation, TRANS it returns @code{ENOSYS} instead. */ -_S(ERR_MAP(ENOTSUP), N_("Not supported")) +_S(ENOTSUP, N_("Not supported")) #endif #ifdef EPROGMISMATCH -_S(ERR_MAP(EPROGMISMATCH), N_("RPC program version wrong")) +_S(EPROGMISMATCH, N_("RPC program version wrong")) #endif #ifdef EBACKGROUND /* @@ -755,7 +752,7 @@ TRANS foreground process group of the terminal. Users do not usually see this TRANS error because functions such as @code{read} and @code{write} translate TRANS it into a @code{SIGTTIN} or @code{SIGTTOU} signal. @xref{Job Control}, TRANS for information on process groups and these signals. */ -_S(ERR_MAP(EBACKGROUND), N_("Inappropriate operation for background process")) +_S(EBACKGROUND, N_("Inappropriate operation for background process")) #endif #ifdef EIEIO /* @@ -773,7 +770,7 @@ TRANS @c "bought the farm" means "died". -jtobey TRANS @c TRANS @c Translators, please do not translate this litteraly, translate it into TRANS @c an idiomatic funny way of saying that the computer died. */ -_S(ERR_MAP(EIEIO), N_("Computer bought the farm")) +_S(EIEIO, N_("Computer bought the farm")) #endif #if defined (EWOULDBLOCK) && EWOULDBLOCK != EAGAIN /* @@ -782,18 +779,18 @@ TRANS The values are always the same, on every operating system. TRANS TRANS C libraries in many older Unix systems have @code{EWOULDBLOCK} as a TRANS separate error code. */ -_S(ERR_MAP(EWOULDBLOCK), N_("Operation would block")) +_S(EWOULDBLOCK, N_("Operation would block")) #endif #ifdef ENEEDAUTH -_S(ERR_MAP(ENEEDAUTH), N_("Need authenticator")) +_S(ENEEDAUTH, N_("Need authenticator")) #endif #ifdef ED /* TRANS The experienced user will know what is wrong. TRANS @c This error code is a joke. Its perror text is part of the joke. TRANS @c Don't change it. */ -_S(ERR_MAP(ED), N_("?")) +_S(ED, N_("?")) #endif #ifdef EPROGUNAVAIL -_S(ERR_MAP(EPROGUNAVAIL), N_("RPC program not available")) +_S(EPROGUNAVAIL, N_("RPC program not available")) #endif diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h index 0f08079e48..672d8f27ce 100644 --- a/sysdeps/i386/dl-machine.h +++ b/sysdeps/i386/dl-machine.h @@ -338,16 +338,22 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, { # ifndef RTLD_BOOTSTRAP if (sym_map != map - && sym_map->l_type != lt_executable && !sym_map->l_relocated) { const char *strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); - _dl_error_printf ("\ + if (sym_map->l_type == lt_executable) + _dl_fatal_printf ("\ +%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \ +and creates an unsatisfiable circular dependency.\n", + RTLD_PROGNAME, strtab + refsym->st_name, + map->l_name); + else + _dl_error_printf ("\ %s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", - RTLD_PROGNAME, map->l_name, - sym_map->l_name, - strtab + refsym->st_name); + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); } # endif value = ((Elf32_Addr (*) (void)) value) (); diff --git a/sysdeps/powerpc/powerpc64/backtrace.c b/sysdeps/powerpc/powerpc64/backtrace.c index 8a53a1088f..362a2b713c 100644 --- a/sysdeps/powerpc/powerpc64/backtrace.c +++ b/sysdeps/powerpc/powerpc64/backtrace.c @@ -54,11 +54,22 @@ struct signal_frame_64 { /* We don't care about the rest, since the IP value is at 'uc' field. */ }; +/* Test if the address match to the inside the trampoline code. + Up to and including kernel 5.8, returning from an interrupt or syscall to a + signal handler starts execution directly at the handler's entry point, with + LR set to address of the sigreturn trampoline (the vDSO symbol). + Newer kernels will branch to signal handler from the trampoline instead, so + checking the stacktrace against the vDSO entrypoint does not work in such + case. + The vDSO branches with a 'bctrl' instruction, so checking either the + vDSO address itself and the next instruction should cover all kernel + versions. */ static inline bool is_sigtramp_address (void *nip) { #ifdef HAVE_SIGTRAMP_RT64 - if (nip == GLRO (dl_vdso_sigtramp_rt64)) + if (nip == GLRO (dl_vdso_sigtramp_rt64) || + nip == GLRO (dl_vdso_sigtramp_rt64) + 4) return true; #endif return false; diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile index 920d875420..bf9b7f7223 100644 --- a/sysdeps/pthread/Makefile +++ b/sysdeps/pthread/Makefile @@ -107,6 +107,7 @@ tests += tst-cnd-basic tst-mtx-trylock tst-cnd-broadcast \ tst-unload \ tst-unwind-thread \ tst-pt-vfork1 tst-pt-vfork2 tst-vfork1x tst-vfork2x \ + tst-pthread-exit-signal \ # Files which must not be linked with libpthread. diff --git a/sysdeps/pthread/tst-pthread-exit-signal.c b/sysdeps/pthread/tst-pthread-exit-signal.c new file mode 100644 index 0000000000..b4526fe663 --- /dev/null +++ b/sysdeps/pthread/tst-pthread-exit-signal.c @@ -0,0 +1,45 @@ +/* Test that pending signals are not delivered on thread exit (bug 28607). + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* Due to bug 28607, pthread_kill (or pthread_cancel) restored the + signal mask during during thread exit, triggering the delivery of a + blocked pending signal (SIGUSR1 in this test). */ + +#include +#include + +static void * +threadfunc (void *closure) +{ + sigset_t sigmask; + sigfillset (&sigmask); + xpthread_sigmask (SIG_SETMASK, &sigmask, NULL); + xpthread_kill (pthread_self (), SIGUSR1); + pthread_exit (NULL); + return NULL; +} + +static int +do_test (void) +{ + pthread_t thr = xpthread_create (NULL, threadfunc, NULL); + xpthread_join (thr); + return 0; +} + +#include diff --git a/sysdeps/s390/configure b/sysdeps/s390/configure index fa46e9e351..e7f576338d 100644 --- a/sysdeps/s390/configure +++ b/sysdeps/s390/configure @@ -123,7 +123,9 @@ void testinsn (char *buf) __asm__ (".machine \"arch13\" \n\t" ".machinemode \"zarch_nohighgprs\" \n\t" "lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c @@ -271,7 +273,9 @@ else void testinsn (char *buf) { __asm__ ("lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c diff --git a/sysdeps/s390/configure.ac b/sysdeps/s390/configure.ac index 3ed5a8ef87..5c3479e8cf 100644 --- a/sysdeps/s390/configure.ac +++ b/sysdeps/s390/configure.ac @@ -88,7 +88,9 @@ void testinsn (char *buf) __asm__ (".machine \"arch13\" \n\t" ".machinemode \"zarch_nohighgprs\" \n\t" "lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF dnl test, if assembler supports S390 arch13 instructions @@ -195,7 +197,9 @@ cat > conftest.c <<\EOF void testinsn (char *buf) { __asm__ ("lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF dnl test, if assembler supports S390 arch13 zarch instructions as default diff --git a/sysdeps/s390/memmove.c b/sysdeps/s390/memmove.c index 5fc85e129f..ee59b5de14 100644 --- a/sysdeps/s390/memmove.c +++ b/sysdeps/s390/memmove.c @@ -43,7 +43,7 @@ extern __typeof (__redirect_memmove) MEMMOVE_ARCH13 attribute_hidden; s390_libc_ifunc_expr (__redirect_memmove, memmove, ({ s390_libc_ifunc_expr_stfle_init (); - (HAVE_MEMMOVE_ARCH13 + (HAVE_MEMMOVE_ARCH13 && (hwcap & HWCAP_S390_VXRS_EXT2) && S390_IS_ARCH13_MIE3 (stfle_bits)) ? MEMMOVE_ARCH13 : (HAVE_MEMMOVE_Z13 && (hwcap & HWCAP_S390_VX)) diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c index e6195c6e26..17c0cc3952 100644 --- a/sysdeps/s390/multiarch/ifunc-impl-list.c +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c @@ -171,7 +171,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, memmove, # if HAVE_MEMMOVE_ARCH13 IFUNC_IMPL_ADD (array, i, memmove, - S390_IS_ARCH13_MIE3 (stfle_bits), + ((dl_hwcap & HWCAP_S390_VXRS_EXT2) + && S390_IS_ARCH13_MIE3 (stfle_bits)), MEMMOVE_ARCH13) # endif # if HAVE_MEMMOVE_Z13 diff --git a/sysdeps/sh/be/sh4/fpu/Implies b/sysdeps/sh/be/sh4/fpu/Implies new file mode 100644 index 0000000000..71b28ee1a4 --- /dev/null +++ b/sysdeps/sh/be/sh4/fpu/Implies @@ -0,0 +1 @@ +sh/sh4/fpu diff --git a/sysdeps/sh/le/sh4/fpu/Implies b/sysdeps/sh/le/sh4/fpu/Implies new file mode 100644 index 0000000000..71b28ee1a4 --- /dev/null +++ b/sysdeps/sh/le/sh4/fpu/Implies @@ -0,0 +1 @@ +sh/sh4/fpu diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile index 9b2a253032..34748ffcd1 100644 --- a/sysdeps/unix/sysv/linux/Makefile +++ b/sysdeps/unix/sysv/linux/Makefile @@ -100,7 +100,7 @@ tests += tst-clone tst-clone2 tst-clone3 tst-fanotify tst-personality \ tst-quota tst-sync_file_range tst-sysconf-iov_max tst-ttyname \ test-errno-linux tst-memfd_create tst-mlock2 tst-pkey \ tst-rlimit-infinity tst-ofdlocks tst-gettid tst-gettid-kill \ - tst-tgkill + tst-tgkill tst-sysvsem-linux tst-sysvmsg-linux tst-sysvshm-linux tests-internal += tst-ofdlocks-compat tst-sigcontext-get_pc CFLAGS-tst-sigcontext-get_pc.c = -fasynchronous-unwind-tables diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h index fc688450ee..00a4d0c8e7 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h @@ -54,6 +54,10 @@ && MIDR_PARTNUM(midr) == 0x000) #define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ && MIDR_PARTNUM(midr) == 0xd0c) +#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ + && MIDR_PARTNUM(midr) == 0xd49) +#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ + && MIDR_PARTNUM(midr) == 0xd40) #define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P' \ && MIDR_PARTNUM(midr) == 0x000) diff --git a/sysdeps/unix/sysv/linux/mq_notify.c b/sysdeps/unix/sysv/linux/mq_notify.c index 61bbb03b64..2bb98172c8 100644 --- a/sysdeps/unix/sysv/linux/mq_notify.c +++ b/sysdeps/unix/sysv/linux/mq_notify.c @@ -132,9 +132,12 @@ helper_thread (void *arg) to wait until it is done with it. */ (void) __pthread_barrier_wait (¬ify_barrier); } - else if (data.raw[NOTIFY_COOKIE_LEN - 1] == NOTIFY_REMOVED) - /* The only state we keep is the copy of the thread attributes. */ - free (data.attr); + else if (data.raw[NOTIFY_COOKIE_LEN - 1] == NOTIFY_REMOVED && data.attr != NULL) + { + /* The only state we keep is the copy of the thread attributes. */ + pthread_attr_destroy (data.attr); + free (data.attr); + } } return NULL; } @@ -255,8 +258,14 @@ mq_notify (mqd_t mqdes, const struct sigevent *notification) if (data.attr == NULL) return -1; - memcpy (data.attr, notification->sigev_notify_attributes, - sizeof (pthread_attr_t)); + int ret = __pthread_attr_copy (data.attr, + notification->sigev_notify_attributes); + if (ret != 0) + { + free (data.attr); + __set_errno (ret); + return -1; + } } /* Construct the new request. */ @@ -269,8 +278,11 @@ mq_notify (mqd_t mqdes, const struct sigevent *notification) int retval = INLINE_SYSCALL (mq_notify, 2, mqdes, &se); /* If it failed, free the allocated memory. */ - if (__glibc_unlikely (retval != 0)) - free (data.attr); + if (retval != 0 && data.attr != NULL) + { + pthread_attr_destroy (data.attr); + free (data.attr); + } return retval; } diff --git a/sysdeps/unix/sysv/linux/msgctl.c b/sysdeps/unix/sysv/linux/msgctl.c index 0776472d5e..a1f24ab242 100644 --- a/sysdeps/unix/sysv/linux/msgctl.c +++ b/sysdeps/unix/sysv/linux/msgctl.c @@ -90,8 +90,15 @@ __msgctl64 (int msqid, int cmd, struct __msqid64_ds *buf) struct kernel_msqid64_ds ksemid, *arg = NULL; if (buf != NULL) { - msqid64_to_kmsqid64 (buf, &ksemid); - arg = &ksemid; + /* This is a Linux extension where kernel returns a 'struct msginfo' + instead. */ + if (cmd == IPC_INFO || cmd == MSG_INFO) + arg = (struct kernel_msqid64_ds *) buf; + else + { + msqid64_to_kmsqid64 (buf, &ksemid); + arg = &ksemid; + } } # ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T if (cmd == IPC_SET) @@ -169,8 +176,15 @@ __msgctl (int msqid, int cmd, struct msqid_ds *buf) struct __msqid64_ds msqid64, *buf64 = NULL; if (buf != NULL) { - msqid_to_msqid64 (&msqid64, buf); - buf64 = &msqid64; + /* This is a Linux extension where kernel returns a 'struct msginfo' + instead. */ + if (cmd == IPC_INFO || cmd == MSG_INFO) + buf64 = (struct __msqid64_ds *) buf; + else + { + msqid_to_msqid64 (&msqid64, buf); + buf64 = &msqid64; + } } int ret = __msgctl64 (msqid, cmd, buf64); diff --git a/sysdeps/unix/sysv/linux/semctl.c b/sysdeps/unix/sysv/linux/semctl.c index f131a26fc7..1cdabde8f2 100644 --- a/sysdeps/unix/sysv/linux/semctl.c +++ b/sysdeps/unix/sysv/linux/semctl.c @@ -102,6 +102,7 @@ semun64_to_ksemun64 (int cmd, union semun64 semun64, r.array = semun64.array; break; case SEM_STAT: + case SEM_STAT_ANY: case IPC_STAT: case IPC_SET: r.buf = buf; @@ -150,6 +151,7 @@ __semctl64 (int semid, int semnum, int cmd, ...) case IPC_STAT: /* arg.buf */ case IPC_SET: case SEM_STAT: + case SEM_STAT_ANY: case IPC_INFO: /* arg.__buf */ case SEM_INFO: va_start (ap, cmd); @@ -238,6 +240,7 @@ semun_to_semun64 (int cmd, union semun semun, struct __semid64_ds *semid64) r.array = semun.array; break; case SEM_STAT: + case SEM_STAT_ANY: case IPC_STAT: case IPC_SET: r.buf = semid64; @@ -267,6 +270,7 @@ __semctl (int semid, int semnum, int cmd, ...) case IPC_STAT: /* arg.buf */ case IPC_SET: case SEM_STAT: + case SEM_STAT_ANY: case IPC_INFO: /* arg.__buf */ case SEM_INFO: va_start (ap, cmd); @@ -321,6 +325,7 @@ __semctl_mode16 (int semid, int semnum, int cmd, ...) case IPC_STAT: /* arg.buf */ case IPC_SET: case SEM_STAT: + case SEM_STAT_ANY: case IPC_INFO: /* arg.__buf */ case SEM_INFO: va_start (ap, cmd); @@ -354,6 +359,7 @@ __old_semctl (int semid, int semnum, int cmd, ...) case IPC_STAT: /* arg.buf */ case IPC_SET: case SEM_STAT: + case SEM_STAT_ANY: case IPC_INFO: /* arg.__buf */ case SEM_INFO: va_start (ap, cmd); diff --git a/sysdeps/unix/sysv/linux/sh/be/sh4/fpu/Implies b/sysdeps/unix/sysv/linux/sh/be/sh4/fpu/Implies new file mode 100644 index 0000000000..7eeaf15a5a --- /dev/null +++ b/sysdeps/unix/sysv/linux/sh/be/sh4/fpu/Implies @@ -0,0 +1 @@ +unix/sysv/linux/sh/sh4/fpu diff --git a/sysdeps/unix/sysv/linux/sh/le/sh4/fpu/Implies b/sysdeps/unix/sysv/linux/sh/le/sh4/fpu/Implies new file mode 100644 index 0000000000..7eeaf15a5a --- /dev/null +++ b/sysdeps/unix/sysv/linux/sh/le/sh4/fpu/Implies @@ -0,0 +1 @@ +unix/sysv/linux/sh/sh4/fpu diff --git a/sysdeps/unix/sysv/linux/shmctl.c b/sysdeps/unix/sysv/linux/shmctl.c index 76d88441f1..1d19a798b1 100644 --- a/sysdeps/unix/sysv/linux/shmctl.c +++ b/sysdeps/unix/sysv/linux/shmctl.c @@ -90,8 +90,15 @@ __shmctl64 (int shmid, int cmd, struct __shmid64_ds *buf) struct kernel_shmid64_ds kshmid, *arg = NULL; if (buf != NULL) { - shmid64_to_kshmid64 (buf, &kshmid); - arg = &kshmid; + /* This is a Linux extension where kernel expects either a + 'struct shminfo' (IPC_INFO) or 'struct shm_info' (SHM_INFO). */ + if (cmd == IPC_INFO || cmd == SHM_INFO) + arg = (struct kernel_shmid64_ds *) buf; + else + { + shmid64_to_kshmid64 (buf, &kshmid); + arg = &kshmid; + } } # ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T if (cmd == IPC_SET) @@ -107,7 +114,6 @@ __shmctl64 (int shmid, int cmd, struct __shmid64_ds *buf) switch (cmd) { - case IPC_INFO: case IPC_STAT: case SHM_STAT: case SHM_STAT_ANY: @@ -168,8 +174,15 @@ __shmctl (int shmid, int cmd, struct shmid_ds *buf) struct __shmid64_ds shmid64, *buf64 = NULL; if (buf != NULL) { - shmid_to_shmid64 (&shmid64, buf); - buf64 = &shmid64; + /* This is a Linux extension where kernel expects either a + 'struct shminfo' (IPC_INFO) or 'struct shm_info' (SHM_INFO). */ + if (cmd == IPC_INFO || cmd == SHM_INFO) + buf64 = (struct __shmid64_ds *) buf; + else + { + shmid_to_shmid64 (&shmid64, buf); + buf64 = &shmid64; + } } int ret = __shmctl64 (shmid, cmd, buf64); @@ -178,7 +191,6 @@ __shmctl (int shmid, int cmd, struct shmid_ds *buf) switch (cmd) { - case IPC_INFO: case IPC_STAT: case SHM_STAT: case SHM_STAT_ANY: diff --git a/sysdeps/unix/sysv/linux/tst-sysvmsg-linux.c b/sysdeps/unix/sysv/linux/tst-sysvmsg-linux.c new file mode 100644 index 0000000000..630f4f792c --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-sysvmsg-linux.c @@ -0,0 +1,177 @@ +/* Basic tests for Linux SYSV message queue extensions. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define MSGQ_MODE 0644 + +/* These are for the temporary file we generate. */ +static char *name; +static int msqid; + +static void +remove_msq (void) +{ + /* Enforce message queue removal in case of early test failure. + Ignore error since the msg may already have being removed. */ + msgctl (msqid, IPC_RMID, NULL); +} + +static void +do_prepare (int argc, char *argv[]) +{ + TEST_VERIFY_EXIT (create_temp_file ("tst-sysvmsg.", &name) != -1); +} + +#define PREPARE do_prepare + +struct test_msginfo +{ + int msgmax; + int msgmnb; + int msgmni; +}; + +/* It tries to obtain some system-wide SysV messsage queue information from + /proc to check against IPC_INFO/MSG_INFO. The /proc only returns the + tunables value of MSGMAX, MSGMNB, and MSGMNI. + + The kernel also returns constant value for MSGSSZ, MSGSEG and also MSGMAP, + MSGPOOL, and MSGTQL (for IPC_INFO). The issue to check them is they might + change over kernel releases. */ + +static int +read_proc_file (const char *file) +{ + FILE *f = fopen (file, "r"); + if (f == NULL) + FAIL_UNSUPPORTED ("/proc is not mounted or %s is not available", file); + + int v; + int r = fscanf (f, "%d", & v); + TEST_VERIFY_EXIT (r == 1); + + fclose (f); + return v; +} + + +/* Check if the message queue with IDX (index into the kernel's internal + array) matches the one with KEY. The CMD is either MSG_STAT or + MSG_STAT_ANY. */ + +static bool +check_msginfo (int idx, key_t key, int cmd) +{ + struct msqid_ds msginfo; + int mid = msgctl (idx, cmd, &msginfo); + /* Ignore unused array slot returned by the kernel or information from + unknown message queue. */ + if ((mid == -1 && errno == EINVAL) || mid != msqid) + return false; + + if (mid == -1) + FAIL_EXIT1 ("msgctl with %s failed: %m", + cmd == MSG_STAT ? "MSG_STAT" : "MSG_STAT_ANY"); + + TEST_COMPARE (msginfo.msg_perm.__key, key); + TEST_COMPARE (msginfo.msg_perm.mode, MSGQ_MODE); + TEST_COMPARE (msginfo.msg_qnum, 0); + + return true; +} + +static int +do_test (void) +{ + atexit (remove_msq); + + key_t key = ftok (name, 'G'); + if (key == -1) + FAIL_EXIT1 ("ftok failed: %m"); + + msqid = msgget (key, MSGQ_MODE | IPC_CREAT); + if (msqid == -1) + FAIL_EXIT1 ("msgget failed: %m"); + + struct test_msginfo tipcinfo; + tipcinfo.msgmax = read_proc_file ("/proc/sys/kernel/msgmax"); + tipcinfo.msgmnb = read_proc_file ("/proc/sys/kernel/msgmnb"); + tipcinfo.msgmni = read_proc_file ("/proc/sys/kernel/msgmni"); + + int msqidx; + + { + struct msginfo ipcinfo; + msqidx = msgctl (msqid, IPC_INFO, (struct msqid_ds *) &ipcinfo); + if (msqidx == -1) + FAIL_EXIT1 ("msgctl with IPC_INFO failed: %m"); + + TEST_COMPARE (ipcinfo.msgmax, tipcinfo.msgmax); + TEST_COMPARE (ipcinfo.msgmnb, tipcinfo.msgmnb); + TEST_COMPARE (ipcinfo.msgmni, tipcinfo.msgmni); + } + + /* Same as before but with MSG_INFO. */ + { + struct msginfo ipcinfo; + msqidx = msgctl (msqid, MSG_INFO, (struct msqid_ds *) &ipcinfo); + if (msqidx == -1) + FAIL_EXIT1 ("msgctl with IPC_INFO failed: %m"); + + TEST_COMPARE (ipcinfo.msgmax, tipcinfo.msgmax); + TEST_COMPARE (ipcinfo.msgmnb, tipcinfo.msgmnb); + TEST_COMPARE (ipcinfo.msgmni, tipcinfo.msgmni); + } + + /* We check if the created message queue shows in global list. */ + bool found = false; + for (int i = 0; i <= msqidx; i++) + { + /* We can't tell apart if MSG_STAT_ANY is not supported (kernel older + than 4.17) or if the index used is invalid. So it just check if the + value returned from a valid call matches the created message + queue. */ + check_msginfo (i, key, MSG_STAT_ANY); + + if (check_msginfo (i, key, MSG_STAT)) + { + found = true; + break; + } + } + + if (!found) + FAIL_EXIT1 ("msgctl with MSG_STAT/MSG_STAT_ANY could not find the " + "created message queue"); + + if (msgctl (msqid, IPC_RMID, NULL) == -1) + FAIL_EXIT1 ("msgctl failed"); + + return 0; +} + +#include diff --git a/sysdeps/unix/sysv/linux/tst-sysvsem-linux.c b/sysdeps/unix/sysv/linux/tst-sysvsem-linux.c new file mode 100644 index 0000000000..45f19e2d37 --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-sysvsem-linux.c @@ -0,0 +1,184 @@ +/* Basic tests for Linux SYSV semaphore extensions. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* These are for the temporary file we generate. */ +static char *name; +static int semid; + +static void +remove_sem (void) +{ + /* Enforce message queue removal in case of early test failure. + Ignore error since the sem may already have being removed. */ + semctl (semid, 0, IPC_RMID, 0); +} + +static void +do_prepare (int argc, char *argv[]) +{ + TEST_VERIFY_EXIT (create_temp_file ("tst-sysvsem.", &name) != -1); +} + +#define PREPARE do_prepare + +#define SEM_MODE 0644 + +union semun +{ + int val; + struct semid_ds *buf; + unsigned short *array; + struct seminfo *__buf; +}; + +struct test_seminfo +{ + int semmsl; + int semmns; + int semopm; + int semmni; +}; + +/* It tries to obtain some system-wide SysV semaphore information from /proc + to check against IPC_INFO/SEM_INFO. The /proc only returns the tunables + value of SEMMSL, SEMMNS, SEMOPM, and SEMMNI. + + The kernel also returns constant value for SEMVMX, SEMMNU, SEMMAP, SEMUME, + and also SEMUSZ and SEMAEM (for IPC_INFO). The issue to check them is they + might change over kernel releases. */ + +static void +read_sem_stat (struct test_seminfo *tseminfo) +{ + FILE *f = fopen ("/proc/sys/kernel/sem", "r"); + if (f == NULL) + FAIL_UNSUPPORTED ("/proc is not mounted or /proc/sys/kernel/sem is not " + "available"); + + int r = fscanf (f, "%d %d %d %d", + &tseminfo->semmsl, &tseminfo->semmns, &tseminfo->semopm, + &tseminfo->semmni); + TEST_VERIFY_EXIT (r == 4); + + fclose (f); +} + + +/* Check if the semaphore with IDX (index into the kernel's internal array) + matches the one with KEY. The CMD is either SEM_STAT or SEM_STAT_ANY. */ + +static bool +check_seminfo (int idx, key_t key, int cmd) +{ + struct semid_ds seminfo; + int sid = semctl (idx, 0, cmd, (union semun) { .buf = &seminfo }); + /* Ignore unused array slot returned by the kernel or information from + unknown semaphores. */ + if ((sid == -1 && errno == EINVAL) || sid != semid) + return false; + + if (sid == -1) + FAIL_EXIT1 ("semctl with SEM_STAT failed (errno=%d)", errno); + + TEST_COMPARE (seminfo.sem_perm.__key, key); + TEST_COMPARE (seminfo.sem_perm.mode, SEM_MODE); + TEST_COMPARE (seminfo.sem_nsems, 1); + + return true; +} + +static int +do_test (void) +{ + atexit (remove_sem); + + key_t key = ftok (name, 'G'); + if (key == -1) + FAIL_EXIT1 ("ftok failed: %m"); + + semid = semget (key, 1, IPC_CREAT | IPC_EXCL | SEM_MODE); + if (semid == -1) + FAIL_EXIT1 ("semget failed: %m"); + + struct test_seminfo tipcinfo; + read_sem_stat (&tipcinfo); + + int semidx; + + { + struct seminfo ipcinfo; + semidx = semctl (semid, 0, IPC_INFO, (union semun) { .__buf = &ipcinfo }); + if (semidx == -1) + FAIL_EXIT1 ("semctl with IPC_INFO failed: %m"); + + TEST_COMPARE (ipcinfo.semmsl, tipcinfo.semmsl); + TEST_COMPARE (ipcinfo.semmns, tipcinfo.semmns); + TEST_COMPARE (ipcinfo.semopm, tipcinfo.semopm); + TEST_COMPARE (ipcinfo.semmni, tipcinfo.semmni); + } + + /* Same as before but with SEM_INFO. */ + { + struct seminfo ipcinfo; + semidx = semctl (semid, 0, SEM_INFO, (union semun) { .__buf = &ipcinfo }); + if (semidx == -1) + FAIL_EXIT1 ("semctl with IPC_INFO failed: %m"); + + TEST_COMPARE (ipcinfo.semmsl, tipcinfo.semmsl); + TEST_COMPARE (ipcinfo.semmns, tipcinfo.semmns); + TEST_COMPARE (ipcinfo.semopm, tipcinfo.semopm); + TEST_COMPARE (ipcinfo.semmni, tipcinfo.semmni); + } + + /* We check if the created semaphore shows in the system-wide status. */ + bool found = false; + for (int i = 0; i <= semidx; i++) + { + /* We can't tell apart if SEM_STAT_ANY is not supported (kernel older + than 4.17) or if the index used is invalid. So it just check if + value returned from a valid call matches the created semaphore. */ + check_seminfo (i, key, SEM_STAT_ANY); + + if (check_seminfo (i, key, SEM_STAT)) + { + found = true; + break; + } + } + + if (!found) + FAIL_EXIT1 ("semctl with SEM_STAT/SEM_STAT_ANY could not find the " + "created semaphore"); + + if (semctl (semid, 0, IPC_RMID, 0) == -1) + FAIL_EXIT1 ("semctl failed: %m"); + + return 0; +} + +#include diff --git a/sysdeps/unix/sysv/linux/tst-sysvshm-linux.c b/sysdeps/unix/sysv/linux/tst-sysvshm-linux.c new file mode 100644 index 0000000000..bb154592a6 --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-sysvshm-linux.c @@ -0,0 +1,188 @@ +/* Basic tests for Linux SYSV shared memory extensions. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SHM_MODE 0644 + +/* These are for the temporary file we generate. */ +static char *name; +static int shmid; +static long int pgsz; + +static void +remove_shm (void) +{ + /* Enforce message queue removal in case of early test failure. + Ignore error since the shm may already have being removed. */ + shmctl (shmid, IPC_RMID, NULL); +} + +static void +do_prepare (int argc, char *argv[]) +{ + TEST_VERIFY_EXIT (create_temp_file ("tst-sysvshm.", &name) != -1); +} + +#define PREPARE do_prepare + +struct test_shminfo +{ + __syscall_ulong_t shmall; + __syscall_ulong_t shmmax; + __syscall_ulong_t shmmni; +}; + +/* It tries to obtain some system-wide SysV shared memory information from + /proc to check against IPC_INFO/SHM_INFO. The /proc only returns the + tunables value of SHMALL, SHMMAX, and SHMMNI. */ + +static uint64_t +read_proc_file (const char *file) +{ + FILE *f = fopen (file, "r"); + if (f == NULL) + FAIL_UNSUPPORTED ("/proc is not mounted or %s is not available", file); + + /* Handle 32-bit binaries running on 64-bit kernels. */ + uint64_t v; + int r = fscanf (f, "%" SCNu64, &v); + TEST_VERIFY_EXIT (r == 1); + + fclose (f); + return v; +} + + +/* Check if the message queue with IDX (index into the kernel's internal + array) matches the one with KEY. The CMD is either SHM_STAT or + SHM_STAT_ANY. */ + +static bool +check_shminfo (int idx, key_t key, int cmd) +{ + struct shmid_ds shminfo; + int sid = shmctl (idx, cmd, &shminfo); + /* Ignore unused array slot returned by the kernel or information from + unknown message queue. */ + if ((sid == -1 && errno == EINVAL) || sid != shmid) + return false; + + if (sid == -1) + FAIL_EXIT1 ("shmctl with %s failed: %m", + cmd == SHM_STAT ? "SHM_STAT" : "SHM_STAT_ANY"); + + TEST_COMPARE (shminfo.shm_perm.__key, key); + TEST_COMPARE (shminfo.shm_perm.mode, SHM_MODE); + TEST_COMPARE (shminfo.shm_segsz, pgsz); + + return true; +} + +static int +do_test (void) +{ + atexit (remove_shm); + + pgsz = sysconf (_SC_PAGESIZE); + if (pgsz == -1) + FAIL_EXIT1 ("sysconf (_SC_PAGESIZE) failed: %m"); + + key_t key = ftok (name, 'G'); + if (key == -1) + FAIL_EXIT1 ("ftok failed: %m"); + + shmid = shmget (key, pgsz, IPC_CREAT | IPC_EXCL | SHM_MODE); + if (shmid == -1) + FAIL_EXIT1 ("shmget failed: %m"); + + /* It does not check shmmax because kernel clamp its value to INT_MAX for: + + 1. Compat symbols with IPC_64, i.e, 32-bit binaries running on 64-bit + kernels. + + 2. Default symbol without IPC_64 (defined as IPC_OLD within Linux) and + glibc always use IPC_64 for 32-bit ABIs (to support 64-bit time_t). + It means that 32-bit binaries running on 32-bit kernels will not see + shmmax being clamped. + + And finding out whether the compat symbol is used would require checking + the underlying kernel against the current ABI. The shmall and shmmni + already provided enough coverage. */ + + struct test_shminfo tipcinfo; + tipcinfo.shmall = read_proc_file ("/proc/sys/kernel/shmall"); + tipcinfo.shmmni = read_proc_file ("/proc/sys/kernel/shmmni"); + + int shmidx; + + /* Note: SHM_INFO does not return a shminfo, but rather a 'struct shm_info'. + It is tricky to verify its values since the syscall returns system wide + resources consumed by shared memory. The shmctl implementation handles + SHM_INFO as IPC_INFO, so the IPC_INFO test should validate SHM_INFO as + well. */ + + { + struct shminfo ipcinfo; + shmidx = shmctl (shmid, IPC_INFO, (struct shmid_ds *) &ipcinfo); + if (shmidx == -1) + FAIL_EXIT1 ("shmctl with IPC_INFO failed: %m"); + + TEST_COMPARE (ipcinfo.shmall, tipcinfo.shmall); + TEST_COMPARE (ipcinfo.shmmni, tipcinfo.shmmni); + } + + /* We check if the created shared memory shows in the global list. */ + bool found = false; + for (int i = 0; i <= shmidx; i++) + { + /* We can't tell apart if SHM_STAT_ANY is not supported (kernel older + than 4.17) or if the index used is invalid. So it just check if + value returned from a valid call matches the created message + queue. */ + check_shminfo (i, key, SHM_STAT_ANY); + + if (check_shminfo (i, key, SHM_STAT)) + { + found = true; + break; + } + } + + if (!found) + FAIL_EXIT1 ("shmctl with SHM_STAT/SHM_STAT_ANY could not find the " + "created shared memory"); + + if (shmctl (shmid, IPC_RMID, NULL) == -1) + FAIL_EXIT1 ("shmctl failed"); + + return 0; +} + +#include diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index a6736aef25..e821d95fa3 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -12,10 +12,39 @@ endif ifeq ($(subdir),setjmp) gen-as-const-headers += jmp_buf-ssp.sym sysdep_routines += __longjmp_cancel +ifneq ($(enable-cet),no) +ifneq ($(have-tunables),no) +tests += tst-setjmp-cet +tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on +endif +endif endif ifeq ($(subdir),string) sysdep_routines += cacheinfo + +tests += \ + tst-memchr-rtm \ + tst-memcmp-rtm \ + tst-memmove-rtm \ + tst-memrchr-rtm \ + tst-memset-rtm \ + tst-strchr-rtm \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ + tst-strrchr-rtm + +CFLAGS-tst-memchr-rtm.c += -mrtm +CFLAGS-tst-memcmp-rtm.c += -mrtm +CFLAGS-tst-memmove-rtm.c += -mrtm +CFLAGS-tst-memrchr-rtm.c += -mrtm +CFLAGS-tst-memset-rtm.c += -mrtm +CFLAGS-tst-strchr-rtm.c += -mrtm +CFLAGS-tst-strcpy-rtm.c += -mrtm +CFLAGS-tst-strlen-rtm.c += -mrtm +CFLAGS-tst-strncmp-rtm.c += -mrtm +CFLAGS-tst-strrchr-rtm.c += -mrtm endif ifneq ($(enable-cet),no) diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c index 217c21c34f..3fb4a028d8 100644 --- a/sysdeps/x86/cacheinfo.c +++ b/sysdeps/x86/cacheinfo.c @@ -808,7 +808,7 @@ init_cacheinfo (void) threads = 1 << ((ecx >> 12) & 0x0f); } - if (threads == 0) + if (threads == 0 || cpu_features->basic.family >= 0x17) { /* If APIC ID width is not available, use logical processor count. */ @@ -823,8 +823,22 @@ init_cacheinfo (void) if (threads > 0) shared /= threads; - /* Account for exclusive L2 and L3 caches. */ - shared += core; + /* Get shared cache per ccx for Zen architectures. */ + if (cpu_features->basic.family >= 0x17) + { + unsigned int eax; + + /* Get number of threads share the L3 cache in CCX. */ + __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); + + unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; + shared *= threads_per_ccx; + } + else + { + /* Account for exclusive L2 and L3 caches. */ + shared += core; + } } } @@ -854,14 +868,20 @@ init_cacheinfo (void) __x86_shared_cache_size = shared; } - /* The large memcpy micro benchmark in glibc shows that 6 times of - shared cache size is the approximate value above which non-temporal - store becomes faster on a 8-core processor. This is the 3/4 of the - total shared cache size. */ + /* The default setting for the non_temporal threshold is 3/4 of one + thread's share of the chip's cache. For most Intel and AMD processors + with an initial release date between 2017 and 2020, a thread's typical + share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4 + threshold leaves 125 KBytes to 500 KBytes of the thread's data + in cache after a maximum temporal copy, which will maintain + in cache a reasonable portion of the thread's stack and other + active data. If the threshold is set higher than one thread's + share of the cache, it has a substantial risk of negatively + impacting the performance of other threads running on the chip. */ __x86_shared_non_temporal_threshold = (cpu_features->non_temporal_threshold != 0 ? cpu_features->non_temporal_threshold - : __x86_shared_cache_size * threads * 3 / 4); + : __x86_shared_cache_size * 3 / 4); /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ unsigned int minimum_rep_movsb_threshold; diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 4c24ba7c31..484efe7a0f 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -71,7 +71,6 @@ update_usable (struct cpu_features *cpu_features) CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_6); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_7); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_9); - CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_11); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_12); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_13); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_17); @@ -318,6 +317,9 @@ update_usable (struct cpu_features *cpu_features) /* Determine if PKU is usable. */ if (CPU_FEATURES_CPU_P (cpu_features, OSPKE)) CPU_FEATURE_SET (cpu_features, PKU); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT)) + CPU_FEATURE_UNSET (cpu_features, RTM); } static void @@ -516,11 +518,39 @@ init_cpu_features (struct cpu_features *cpu_features) break; } - /* Disable TSX on some Haswell processors to avoid TSX on kernels that - weren't updated with the latest microcode package (which disables - broken feature by default). */ + /* Disable TSX on some processors to avoid TSX on kernels that + weren't updated with the latest microcode package (which + disables broken feature by default). */ switch (model) { + case 0x55: + if (stepping <= 5) + goto disable_tsx; + break; + case 0x8e: + /* NB: Although the errata documents that for model == 0x8e, + only 0xb stepping or lower are impacted, the intention of + the errata was to disable TSX on all client processors on + all steppings. Include 0xc stepping which is an Intel + Core i7-8665U, a client mobile processor. */ + case 0x9e: + if (stepping > 0xc) + break; + /* Fall through. */ + case 0x4e: + case 0x5e: + { + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + +https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +disable_tsx: + CPU_FEATURE_UNSET (cpu_features, HLE); + CPU_FEATURE_UNSET (cpu_features, RTM); + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); + } + break; case 0x3f: /* Xeon E7 v3 with stepping >= 4 has working TSX. */ if (stepping >= 4) @@ -546,8 +576,24 @@ init_cpu_features (struct cpu_features *cpu_features) cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] |= bit_arch_Prefer_No_VZEROUPPER; else - cpu_features->preferred[index_arch_Prefer_No_AVX512] - |= bit_arch_Prefer_No_AVX512; + { + cpu_features->preferred[index_arch_Prefer_No_AVX512] + |= bit_arch_Prefer_No_AVX512; + + /* Avoid RTM abort triggered by VZEROUPPER inside a + transactionally executing RTM region. */ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; + + /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp + requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp + requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, + AVX2 strcmp is faster than EVEX strcmp. */ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP] + |= bit_arch_Prefer_AVX2_STRCMP; + } } /* This spells out "AuthenticAMD" or "HygonGenuine". */ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index a0b9b9177c..8995a15f09 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -295,7 +295,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define bit_cpu_AVX512_VP2INTERSECT (1u << 8) #define bit_cpu_INDEX_7_EDX_9 (1u << 9) #define bit_cpu_MD_CLEAR (1u << 10) -#define bit_cpu_INDEX_7_EDX_11 (1u << 11) +#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11) #define bit_cpu_INDEX_7_EDX_12 (1u << 12) #define bit_cpu_INDEX_7_EDX_13 (1u << 13) #define bit_cpu_SERIALIZE (1u << 14) @@ -508,7 +508,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7 #define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7 #define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7 -#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7 +#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7 #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7 #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7 #define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7 @@ -721,7 +721,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define reg_AVX512_VP2INTERSECT edx #define reg_INDEX_7_EDX_9 edx #define reg_MD_CLEAR edx -#define reg_INDEX_7_EDX_11 edx +#define reg_RTM_ALWAYS_ABORT edx #define reg_INDEX_7_EDX_12 edx #define reg_INDEX_7_EDX_13 edx #define reg_SERIALIZE edx @@ -804,6 +804,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define bit_arch_Prefer_FSRM (1u << 13) #define bit_arch_Prefer_No_AVX512 (1u << 14) #define bit_arch_MathVec_Prefer_No_AVX512 (1u << 15) +#define bit_arch_Prefer_AVX2_STRCMP (1u << 16) #define index_arch_Fast_Rep_String PREFERRED_FEATURE_INDEX_1 #define index_arch_Fast_Copy_Backward PREFERRED_FEATURE_INDEX_1 @@ -821,6 +822,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define index_arch_Prefer_No_AVX512 PREFERRED_FEATURE_INDEX_1 #define index_arch_MathVec_Prefer_No_AVX512 PREFERRED_FEATURE_INDEX_1 #define index_arch_Prefer_FSRM PREFERRED_FEATURE_INDEX_1 +#define index_arch_Prefer_AVX2_STRCMP PREFERRED_FEATURE_INDEX_1 /* XCR0 Feature flags. */ #define bit_XMM_state (1u << 1) diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index 588bbf9448..b251a91af3 100644 --- a/sysdeps/x86/cpu-tunables.c +++ b/sysdeps/x86/cpu-tunables.c @@ -238,6 +238,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, Fast_Copy_Backward, disable, 18); + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH + (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18); } break; case 19: diff --git a/sysdeps/x86/dl-cet.c b/sysdeps/x86/dl-cet.c index 03572f7af6..3cc54a8d53 100644 --- a/sysdeps/x86/dl-cet.c +++ b/sysdeps/x86/dl-cet.c @@ -47,7 +47,10 @@ dl_cet_check (struct link_map *m, const char *program) /* No legacy object check if both IBT and SHSTK are always on. */ if (enable_ibt_type == cet_always_on && enable_shstk_type == cet_always_on) - return; + { + THREAD_SETMEM (THREAD_SELF, header.feature_1, GL(dl_x86_feature_1)); + return; + } /* Check if IBT is enabled by kernel. */ bool ibt_enabled diff --git a/sysdeps/x86/dl-prop.h b/sysdeps/x86/dl-prop.h index 89911e19e2..4eb3b85a7b 100644 --- a/sysdeps/x86/dl-prop.h +++ b/sysdeps/x86/dl-prop.h @@ -145,15 +145,15 @@ _dl_process_cet_property_note (struct link_map *l, } static inline void __attribute__ ((unused)) -_dl_process_pt_note (struct link_map *l, const ElfW(Phdr) *ph) +_dl_process_pt_note (struct link_map *l, int fd, const ElfW(Phdr) *ph) { const ElfW(Nhdr) *note = (const void *) (ph->p_vaddr + l->l_addr); _dl_process_cet_property_note (l, note, ph->p_memsz, ph->p_align); } static inline int __attribute__ ((always_inline)) -_dl_process_gnu_property (struct link_map *l, uint32_t type, uint32_t datasz, - void *data) +_dl_process_gnu_property (struct link_map *l, int fd, uint32_t type, + uint32_t datasz, void *data) { return 0; } diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c index 080c58e70b..527de3b5d9 100644 --- a/sysdeps/x86/tst-get-cpu-features.c +++ b/sysdeps/x86/tst-get-cpu-features.c @@ -183,6 +183,7 @@ do_test (void) CHECK_CPU_FEATURE (FSRM); CHECK_CPU_FEATURE (AVX512_VP2INTERSECT); CHECK_CPU_FEATURE (MD_CLEAR); + CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT); CHECK_CPU_FEATURE (SERIALIZE); CHECK_CPU_FEATURE (HYBRID); CHECK_CPU_FEATURE (TSXLDTRK); @@ -336,6 +337,7 @@ do_test (void) CHECK_CPU_FEATURE_USABLE (FSRM); CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT); CHECK_CPU_FEATURE_USABLE (MD_CLEAR); + CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT); CHECK_CPU_FEATURE_USABLE (SERIALIZE); CHECK_CPU_FEATURE_USABLE (HYBRID); CHECK_CPU_FEATURE_USABLE (TSXLDTRK); diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c new file mode 100644 index 0000000000..e47494011e --- /dev/null +++ b/sysdeps/x86/tst-memchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for memchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = memchr (string1, 'c', STRING_SIZE); + if (p == &string1[100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = memchr (string1, 'c', STRING_SIZE); + if (p == &string1[100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c new file mode 100644 index 0000000000..e4c8a623bb --- /dev/null +++ b/sysdeps/x86/tst-memcmp-rtm.c @@ -0,0 +1,52 @@ +/* Test case for memcmp inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + memset (string2, 'a', STRING_SIZE); + if (memcmp (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (memcmp (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memcmp", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c new file mode 100644 index 0000000000..4bf97ef1e3 --- /dev/null +++ b/sysdeps/x86/tst-memmove-rtm.c @@ -0,0 +1,53 @@ +/* Test case for memmove inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + if (memmove (string2, string1, STRING_SIZE) == string2 + && memcmp (string2, string1, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (memmove (string2, string1, STRING_SIZE) == string2 + && memcmp (string2, string1, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memmove", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c new file mode 100644 index 0000000000..a57a5a8eb9 --- /dev/null +++ b/sysdeps/x86/tst-memrchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for memrchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = memrchr (string1, 'c', STRING_SIZE); + if (p == &string1[STRING_SIZE - 100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = memrchr (string1, 'c', STRING_SIZE); + if (p == &string1[STRING_SIZE - 100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memrchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c new file mode 100644 index 0000000000..bf343a4dad --- /dev/null +++ b/sysdeps/x86/tst-memset-rtm.c @@ -0,0 +1,45 @@ +/* Test case for memset inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + return EXIT_SUCCESS; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + memset (string1, 'a', STRING_SIZE); + return 0; +} + +static int +do_test (void) +{ + return do_test_1 ("memset", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-setjmp-cet.c b/sysdeps/x86/tst-setjmp-cet.c new file mode 100644 index 0000000000..42c795d2a8 --- /dev/null +++ b/sysdeps/x86/tst-setjmp-cet.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c new file mode 100644 index 0000000000..a82e29c072 --- /dev/null +++ b/sysdeps/x86/tst-strchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for strchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = strchr (string1, 'c'); + if (p == &string1[100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = strchr (string1, 'c'); + if (p == &string1[100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c new file mode 100644 index 0000000000..2b2a583fb4 --- /dev/null +++ b/sysdeps/x86/tst-strcpy-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strcpy inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + if (strcpy (string2, string1) == string2 + && strcmp (string2, string1) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (strcpy (string2, string1) == string2 + && strcmp (string2, string1) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strcpy", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h new file mode 100644 index 0000000000..6ed9eca017 --- /dev/null +++ b/sysdeps/x86/tst-string-rtm.h @@ -0,0 +1,72 @@ +/* Test string function in a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +static int +do_test_1 (const char *name, unsigned int loop, int (*prepare) (void), + int (*function) (void)) +{ + if (!CPU_FEATURE_USABLE (RTM)) + return EXIT_UNSUPPORTED; + + int status = prepare (); + if (status != EXIT_SUCCESS) + return status; + + unsigned int i; + unsigned int naborts = 0; + unsigned int failed = 0; + for (i = 0; i < loop; i++) + { + failed |= function (); + if (_xbegin() == _XBEGIN_STARTED) + { + failed |= function (); + _xend(); + } + else + { + failed |= function (); + ++naborts; + } + } + + if (failed) + FAIL_EXIT1 ("%s() failed", name); + + if (naborts) + { + /* NB: Low single digit (<= 5%) noise-level aborts are normal for + TSX. */ + double rate = 100 * ((double) naborts) / ((double) loop); + if (rate > 5) + FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)", + rate, naborts, loop); + } + + return EXIT_SUCCESS; +} + +static int do_test (void); + +#include diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c new file mode 100644 index 0000000000..0dcf14db87 --- /dev/null +++ b/sysdeps/x86/tst-strlen-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strlen inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[STRING_SIZE - 100] = '\0'; + size_t len = strlen (string1); + if (len == STRING_SIZE - 100) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + size_t len = strlen (string1); + if (len == STRING_SIZE - 100) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strlen", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c new file mode 100644 index 0000000000..236ad951b5 --- /dev/null +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -0,0 +1,52 @@ +/* Test case for strncmp inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + memset (string2, 'a', STRING_SIZE - 1); + if (strncmp (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (strncmp (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strncmp", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c new file mode 100644 index 0000000000..e32bfaf5f5 --- /dev/null +++ b/sysdeps/x86/tst-strrchr-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strrchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[STRING_SIZE - 100] = 'c'; + char *p = strrchr (string1, 'c'); + if (p == &string1[STRING_SIZE - 100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = strrchr (string1, 'c'); + if (p == &string1[STRING_SIZE - 100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strrchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 42b97c5cc7..020044da80 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -20,6 +20,8 @@ endif ifeq ($(subdir),string) sysdep_routines += strcasecmp_l-nonascii strncase_l-nonascii gen-as-const-headers += locale-defines.sym +tests += \ + tst-rsi-strlen endif ifeq ($(subdir),elf) @@ -150,6 +152,11 @@ ifeq ($(subdir),csu) gen-as-const-headers += tlsdesc.sym rtld-offsets.sym endif +ifeq ($(subdir),wcsmbs) +tests += \ + tst-rsi-wcslen +endif + $(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os $(make-target-directory) rm -f $@ diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure old mode 100644 new mode 100755 index 84f82c2406..fc1840e23f --- a/sysdeps/x86_64/configure +++ b/sysdeps/x86_64/configure @@ -107,39 +107,6 @@ if test x"$build_mathvec" = xnotset; then build_mathvec=yes fi -if test "$static_pie" = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for linker static PIE support" >&5 -$as_echo_n "checking for linker static PIE support... " >&6; } -if ${libc_cv_ld_static_pie+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat > conftest.s <<\EOF - .text - .global _start - .weak foo -_start: - leaq foo(%rip), %rax -EOF - libc_cv_pie_option="-Wl,-pie" - if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&5' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then - libc_cv_ld_static_pie=yes - else - libc_cv_ld_static_pie=no - fi -rm -f conftest* -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_ld_static_pie" >&5 -$as_echo "$libc_cv_ld_static_pie" >&6; } - if test "$libc_cv_ld_static_pie" != yes; then - as_fn_error $? "linker support for static PIE needed" "$LINENO" 5 - fi -fi - $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac index cdaba0c075..611a7d9ba3 100644 --- a/sysdeps/x86_64/configure.ac +++ b/sysdeps/x86_64/configure.ac @@ -53,31 +53,6 @@ if test x"$build_mathvec" = xnotset; then build_mathvec=yes fi -dnl Check if linker supports static PIE with the fix for -dnl -dnl https://sourceware.org/bugzilla/show_bug.cgi?id=21782 -dnl -if test "$static_pie" = yes; then - AC_CACHE_CHECK(for linker static PIE support, libc_cv_ld_static_pie, [dnl -cat > conftest.s <<\EOF - .text - .global _start - .weak foo -_start: - leaq foo(%rip), %rax -EOF - libc_cv_pie_option="-Wl,-pie" - if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&AS_MESSAGE_LOG_FD); then - libc_cv_ld_static_pie=yes - else - libc_cv_ld_static_pie=no - fi -rm -f conftest*]) - if test "$libc_cv_ld_static_pie" != yes; then - AC_MSG_ERROR([linker support for static PIE needed]) - fi -fi - dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index ca73d8fef9..363a749cb2 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -315,16 +315,22 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, { # ifndef RTLD_BOOTSTRAP if (sym_map != map - && sym_map->l_type != lt_executable && !sym_map->l_relocated) { const char *strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); - _dl_error_printf ("\ + if (sym_map->l_type == lt_executable) + _dl_fatal_printf ("\ +%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \ +and creates an unsatisfiable circular dependency.\n", + RTLD_PROGNAME, strtab + refsym->st_name, + map->l_name); + else + _dl_error_printf ("\ %s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", - RTLD_PROGNAME, map->l_name, - sym_map->l_name, - strtab + refsym->st_name); + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); } # endif value = ((ElfW(Addr) (*) (void)) value) (); diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h index 7659758972..e5fd5ac9cb 100644 --- a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h @@ -32,7 +32,7 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, AVX2)) return OPTIMIZE (fma); - if (CPU_FEATURE_USABLE_P (cpu_features, FMA)) + if (CPU_FEATURE_USABLE_P (cpu_features, FMA4)) return OPTIMIZE (fma4); return OPTIMIZE (sse2); diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index a5c879d2af..070e5ef90b 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -21,9 +21,11 @@ #ifdef USE_AS_WMEMCHR # define MEMCHR wmemchr # define PCMPEQ pcmpeqd +# define CHAR_PER_VEC 4 #else # define MEMCHR memchr # define PCMPEQ pcmpeqb +# define CHAR_PER_VEC 16 #endif /* fast SSE2 version with using pmaxub and 64 byte loop */ @@ -33,15 +35,14 @@ ENTRY(MEMCHR) movd %esi, %xmm1 mov %edi, %ecx +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +#endif #ifdef USE_AS_WMEMCHR test %RDX_LP, %RDX_LP jz L(return_null) - shl $2, %RDX_LP #else -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif punpcklbw %xmm1, %xmm1 test %RDX_LP, %RDX_LP jz L(return_null) @@ -60,13 +61,16 @@ ENTRY(MEMCHR) test %eax, %eax jnz L(matches_1) - sub $16, %rdx + sub $CHAR_PER_VEC, %rdx jbe L(return_null) add $16, %rdi and $15, %ecx and $-16, %rdi +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) jmp L(loop_prolog) @@ -77,16 +81,21 @@ L(crosscache): movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 -/* Check if there is a match. */ + /* Check if there is a match. */ pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ + /* Remove the leading bytes. */ sar %cl, %eax test %eax, %eax je L(unaligned_no_match) -/* Check which byte is a match. */ + /* Check which byte is a match. */ bsf %eax, %eax - +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax add %rcx, %rax @@ -94,15 +103,18 @@ L(crosscache): .p2align 4 L(unaligned_no_match): - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void possible addition overflow. */ neg %rcx add $16, %rcx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif sub %rcx, %rdx jbe L(return_null) add $16, %rdi - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) .p2align 4 @@ -135,7 +147,7 @@ L(loop_prolog): test $0x3f, %rdi jz L(align64_loop) - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 @@ -167,11 +179,14 @@ L(loop_prolog): mov %rdi, %rcx and $-64, %rdi and $63, %ecx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx .p2align 4 L(align64_loop): - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 movdqa 16(%rdi), %xmm2 @@ -218,7 +233,7 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 @@ -238,7 +253,7 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jle L(return_null) PCMPEQ 48(%rdi), %xmm1 @@ -250,13 +265,13 @@ L(exit_loop): .p2align 4 L(exit_loop_32): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jbe L(return_null) PCMPEQ 16(%rdi), %xmm1 @@ -293,7 +308,13 @@ L(matches32): .p2align 4 L(matches_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax ret @@ -301,7 +322,13 @@ L(matches_1): .p2align 4 L(matches16_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 16(%rdi, %rax), %rax ret @@ -309,7 +336,13 @@ L(matches16_1): .p2align 4 L(matches32_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 32(%rdi, %rax), %rax ret @@ -317,7 +350,13 @@ L(matches32_1): .p2align 4 L(matches48_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 48(%rdi, %rax), %rax ret diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 395e432c09..da1446d731 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -43,7 +43,45 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memmove-avx512-unaligned-erms \ memset-sse2-unaligned-erms \ memset-avx2-unaligned-erms \ - memset-avx512-unaligned-erms + memset-avx512-unaligned-erms \ + memchr-avx2-rtm \ + memcmp-avx2-movbe-rtm \ + memmove-avx-unaligned-erms-rtm \ + memrchr-avx2-rtm \ + memset-avx2-unaligned-erms-rtm \ + rawmemchr-avx2-rtm \ + strchr-avx2-rtm \ + strcmp-avx2-rtm \ + strchrnul-avx2-rtm \ + stpcpy-avx2-rtm \ + stpncpy-avx2-rtm \ + strcat-avx2-rtm \ + strcpy-avx2-rtm \ + strlen-avx2-rtm \ + strncat-avx2-rtm \ + strncmp-avx2-rtm \ + strncpy-avx2-rtm \ + strnlen-avx2-rtm \ + strrchr-avx2-rtm \ + memchr-evex \ + memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ + memrchr-evex \ + memset-evex-unaligned-erms \ + rawmemchr-evex \ + stpcpy-evex \ + stpncpy-evex \ + strcat-evex \ + strchr-evex \ + strchrnul-evex \ + strcmp-evex \ + strcpy-evex \ + strlen-evex \ + strncat-evex \ + strncmp-evex \ + strncpy-evex \ + strnlen-evex \ + strrchr-evex CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 @@ -59,8 +97,24 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ wcscpy-ssse3 wcscpy-c \ wcschr-sse2 wcschr-avx2 \ wcsrchr-sse2 wcsrchr-avx2 \ - wcsnlen-sse4_1 wcsnlen-c \ - wcslen-sse2 wcslen-avx2 wcsnlen-avx2 + wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ + wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ + wcsncmp-avx2-rtm \ + wcsnlen-avx2-rtm \ + wcsrchr-avx2-rtm \ + wmemchr-avx2-rtm \ + wmemcmp-avx2-movbe-rtm \ + wcschr-evex \ + wcscmp-evex \ + wcslen-evex \ + wcsncmp-evex \ + wcsnlen-evex \ + wcsrchr-evex \ + wmemchr-evex \ + wmemcmp-evex-movbe endif ifeq ($(subdir),debug) diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index f4e311d470..f450c786f0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -21,16 +21,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f93ec39d98..920e64241e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -43,6 +43,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memchr, CPU_FEATURE_USABLE (AVX2), __memchr_avx2) + IFUNC_IMPL_ADD (array, i, memchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex) IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) /* Support sysdeps/x86_64/multiarch/memcmp.c. */ @@ -51,6 +60,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (MOVBE)), __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), @@ -64,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512F), __memmove_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX), @@ -75,6 +94,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX), __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (SSSE3), __memmove_chk_ssse3_back) @@ -97,14 +130,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX), __memmove_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512F), __memmove_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memmove, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memmove, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), __memmove_ssse3_back) @@ -121,6 +168,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memrchr, CPU_FEATURE_USABLE (AVX2), __memrchr_avx2) + IFUNC_IMPL_ADD (array, i, memrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __memrchr_evex) + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2)) #ifdef SHARED @@ -139,10 +195,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __memset_chk_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, - CPU_FEATURE_USABLE (AVX512F), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_chk_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), __memset_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, - CPU_FEATURE_USABLE (AVX512F), + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), __memset_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memset_chk, CPU_FEATURE_USABLE (AVX512F), @@ -164,10 +238,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __memset_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, memset, - CPU_FEATURE_USABLE (AVX512F), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), __memset_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memset, - CPU_FEATURE_USABLE (AVX512F), + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), __memset_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memset, CPU_FEATURE_USABLE (AVX512F), @@ -179,20 +271,51 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, rawmemchr, CPU_FEATURE_USABLE (AVX2), __rawmemchr_avx2) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __rawmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex) IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strlen_evex) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) /* Support sysdeps/x86_64/multiarch/strnlen.c. */ IFUNC_IMPL (i, name, strnlen, IFUNC_IMPL_ADD (array, i, strnlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strnlen_evex) IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ @@ -201,6 +324,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) + IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __stpncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __stpncpy_evex) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) @@ -211,6 +342,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) + IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __stpcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __stpcpy_evex) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) @@ -245,6 +384,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), __strcat_avx2) + IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strcat_evex) IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) @@ -255,6 +402,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchr, CPU_FEATURE_USABLE (AVX2), __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strchr_evex) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) @@ -263,6 +419,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchrnul, CPU_FEATURE_USABLE (AVX2), __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strchrnul_evex) IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) /* Support sysdeps/x86_64/multiarch/strrchr.c. */ @@ -270,6 +435,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strrchr, CPU_FEATURE_USABLE (AVX2), __strrchr_avx2) + IFUNC_IMPL_ADD (array, i, strrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strrchr_evex) IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) /* Support sysdeps/x86_64/multiarch/strcmp.c. */ @@ -277,6 +450,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (AVX2), __strcmp_avx2) + IFUNC_IMPL_ADD (array, i, strcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strcmp_evex) IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), __strcmp_sse42) IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), @@ -288,6 +470,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcpy, IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), __strcpy_avx2) + IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strcpy_evex) IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) @@ -331,6 +521,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncat, IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), __strncat_avx2) + IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strncat_evex) IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, @@ -341,6 +539,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncpy, IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), __strncpy_avx2) + IFUNC_IMPL_ADD (array, i, strncpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strncpy_evex) IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, @@ -370,6 +576,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcschr, CPU_FEATURE_USABLE (AVX2), __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcschr_evex) IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) /* Support sysdeps/x86_64/multiarch/wcsrchr.c. */ @@ -377,6 +592,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsrchr, CPU_FEATURE_USABLE (AVX2), __wcsrchr_avx2) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcsrchr_evex) IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) /* Support sysdeps/x86_64/multiarch/wcscmp.c. */ @@ -384,6 +608,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcscmp, CPU_FEATURE_USABLE (AVX2), __wcscmp_avx2) + IFUNC_IMPL_ADD (array, i, wcscmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcscmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcscmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcscmp_evex) IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */ @@ -391,6 +624,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsncmp, CPU_FEATURE_USABLE (AVX2), __wcsncmp_avx2) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcsncmp_evex) IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ @@ -402,15 +644,40 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcslen.c. */ IFUNC_IMPL (i, name, wcslen, IFUNC_IMPL_ADD (array, i, wcslen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) + IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (SSE4_1), + __wcslen_sse4_1) IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ IFUNC_IMPL (i, name, wcsnlen, IFUNC_IMPL_ADD (array, i, wcsnlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __wcsnlen_avx2) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcsnlen_evex) IFUNC_IMPL_ADD (array, i, wcsnlen, CPU_FEATURE_USABLE (SSE4_1), __wcsnlen_sse4_1) @@ -421,6 +688,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemchr, CPU_FEATURE_USABLE (AVX2), __wmemchr_avx2) + IFUNC_IMPL_ADD (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex) IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ @@ -429,6 +705,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), @@ -443,7 +729,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wmemset_avx2_unaligned) IFUNC_IMPL_ADD (array, i, wmemset, - CPU_FEATURE_USABLE (AVX512F), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512VL), __wmemset_avx512_unaligned)) #ifdef SHARED @@ -453,10 +746,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512F), __memcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX), @@ -464,6 +757,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX), __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (SSSE3), __memcpy_chk_ssse3_back) @@ -486,6 +793,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX), __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), @@ -494,10 +815,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512F), __memcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memcpy, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __memcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, 1, @@ -511,10 +832,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512F), __mempcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX), @@ -522,6 +843,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX), __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (SSSE3), __mempcpy_chk_ssse3_back) @@ -542,10 +877,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512F), __mempcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, mempcpy, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __mempcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, - CPU_FEATURE_USABLE (AVX512F), + CPU_FEATURE_USABLE (AVX512VL), __mempcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX), @@ -553,6 +888,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX), __mempcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), @@ -568,6 +917,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (AVX2), __strncmp_avx2) + IFUNC_IMPL_ADD (array, i, strncmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strncmp_evex) IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), __strncmp_sse42) IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), @@ -582,6 +939,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __wmemset_chk, CPU_FEATURE_USABLE (AVX2), __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_chk_evex_unaligned) IFUNC_IMPL_ADD (array, i, __wmemset_chk, CPU_FEATURE_USABLE (AVX512F), __wmemset_chk_avx512_unaligned)) diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index 0e21b3a628..4f96c2764a 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -23,17 +23,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2_movbe); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex_movbe); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_movbe_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_movbe); + } if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 9ada03aa43..db26210e3b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -29,6 +29,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) @@ -48,21 +56,42 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) { - if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) - return OPTIMIZE (avx512_no_vzeroupper); + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (avx512_unaligned_erms); + return OPTIMIZE (avx512_unaligned); + } - return OPTIMIZE (avx512_unaligned); + return OPTIMIZE (avx512_no_vzeroupper); } if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (avx_unaligned_erms); + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms_rtm); + + return OPTIMIZE (avx_unaligned_rtm); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms); - return OPTIMIZE (avx_unaligned); + return OPTIMIZE (avx_unaligned); + } } if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index f52613d372..57029fc17b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -27,6 +27,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) @@ -45,21 +53,44 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) { - if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) - return OPTIMIZE (avx512_no_vzeroupper); + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (avx512_unaligned_erms); + return OPTIMIZE (avx512_unaligned); + } - return OPTIMIZE (avx512_unaligned); + return OPTIMIZE (avx512_no_vzeroupper); } if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (avx2_unaligned_erms); - else - return OPTIMIZE (avx2_unaligned); + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms); + + return OPTIMIZE (avx2_unaligned); + } } if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index 63b0dc0d96..35741f3ec8 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -25,16 +25,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h new file mode 100644 index 0000000000..39e3347378 --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h @@ -0,0 +1,52 @@ +/* Common definition for ifunc selections for wcslen and wcsnlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h index 8cfce562fc..e06e8b4d80 100644 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -20,6 +20,9 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; static inline void * @@ -27,14 +30,21 @@ IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) - && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) - return OPTIMIZE (avx512_unaligned); - else + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) + { + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + return OPTIMIZE (avx512_unaligned); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S new file mode 100644 index 0000000000..87b076c7c4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCHR +# define MEMCHR __memchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index e5a9abd211..0987616a1b 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -26,319 +26,407 @@ # ifdef USE_AS_WMEMCHR # define VPCMPEQ vpcmpeqd +# define VPBROADCAST vpbroadcastd +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb +# define VPBROADCAST vpbroadcastb +# define CHAR_SIZE 1 +# endif + +# ifdef USE_AS_RAWMEMCHR +# define ERAW_PTR_REG ecx +# define RRAW_PTR_REG rcx +# define ALGN_PTR_REG rdi +# else +# define ERAW_PTR_REG edi +# define RRAW_PTR_REG rdi +# define ALGN_PTR_REG rcx # endif # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else test %RDX_LP, %RDX_LP +# endif jz L(null) # endif - movl %edi, %ecx - /* Broadcast CHAR to YMM0. */ + /* Broadcast CHAR to YMMMATCH. */ vmovd %esi, %xmm0 -# ifdef USE_AS_WMEMCHR - shl $2, %RDX_LP - vpbroadcastd %xmm0, %ymm0 -# else -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif - vpbroadcastb %xmm0, %ymm0 -# endif + VPBROADCAST %xmm0, %ymm0 /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 + VPCMPEQ (%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - # ifndef USE_AS_RAWMEMCHR - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rdx - jbe L(zero) -# else - jnz L(first_vec_x0) + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + addq %rdi, %rax + VZEROUPPER_RETURN # ifndef USE_AS_RAWMEMCHR - /* Adjust length. */ - addq %rcx, %rdx + .p2align 5 +L(first_vec_x0): + /* Check if first match was before length. */ + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax), %rax + cmovle %rcx, %rax + VZEROUPPER_RETURN - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) +L(null): + xorl %eax, %eax + ret # endif - jmp L(more_4x_vec) - .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or + adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr + and rdi for rawmemchr. */ + orq $(VEC_SIZE - 1), %ALGN_PTR_REG + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a + match). */ + leaq 1(%ALGN_PTR_REG), %rsi + subq %RRAW_PTR_REG, %rsi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) - tzcntl %eax, %eax + sarxl %ERAW_PTR_REG, %eax, %eax # ifndef USE_AS_RAWMEMCHR /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) + cmpq %rsi, %rdx + jbe L(first_vec_x0) # endif + testl %eax, %eax + jz L(cross_page_continue) + tzcntl %eax, %eax + addq %RRAW_PTR_REG, %rax +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + incq %rdi addq %rdi, %rax - addq %rcx, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): -# ifndef USE_AS_RAWMEMCHR - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition - overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx +L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN - /* Check the end of data. */ - subq %rcx, %rdx - jbe L(zero) -# endif + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi -# ifndef USE_AS_RAWMEMCHR - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) -# endif + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 4 +L(aligned_more): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +# ifndef USE_AS_RAWMEMCHR +L(cross_page_continue): + /* Align data to VEC_SIZE - 1. */ + xorl %ecx, %ecx + subl %edi, %ecx + orq $(VEC_SIZE - 1), %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %esi +# endif +# else + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. If near end handle specially. */ + subq %rsi, %rdx + jbe L(last_4x_vec_or_less) +# endif testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifndef USE_AS_RAWMEMCHR - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) # ifndef USE_AS_RAWMEMCHR - /* Adjust length. */ + /* Check if at last VEC_SIZE * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust + length. */ + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif addq %rcx, %rdx +# else + /* Align data to VEC_SIZE * 4 - 1 for loop. */ + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 - + VPCMPEQ 1(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2 + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3 + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4 vpor %ymm1, %ymm2, %ymm5 vpor %ymm3, %ymm4, %ymm6 vpor %ymm5, %ymm6, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - + vpmovmskb %ymm5, %ecx # ifdef USE_AS_RAWMEMCHR - jmp L(loop_4x_vec) + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) # else - subq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec) + testl %ecx, %ecx + jnz L(loop_4x_vec_end) -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %edx - jle L(last_2x_vec) + subq $-(VEC_SIZE * 4), %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + /* Fall through into less than 4 remaining vectors of length + case. */ + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax + .p2align 4 +L(last_4x_vec_or_less): +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + /* Check if first VEC contained match. */ testl %eax, %eax - jnz L(first_vec_x1) + jnz L(first_vec_x1_check) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax + /* If remaining length > VEC_SIZE * 2. */ + addl $(VEC_SIZE * 2), %edx + jg L(last_4x_vec) - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %edx - jle L(zero) +L(last_2x_vec): + /* If remaining length < VEC_SIZE. */ + addl $VEC_SIZE, %edx + jle L(zero_end) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + /* Check VEC2 and compare any match with remaining length. */ + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - - jnz L(first_vec_x3_check) - xorl %eax, %eax - VZEROUPPER - ret + tzcntl %eax, %eax + cmpl %eax, %edx + jbe L(set_zero_end) + addq $(VEC_SIZE + 1), %rdi + addq %rdi, %rax +L(zero_end): + VZEROUPPER_RETURN .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %edx - VPCMPEQ (%rdi), %ymm0, %ymm1 +L(loop_4x_vec_end): +# endif + /* rawmemchr will fall through into this if match was found in + loop. */ + vpmovmskb %ymm1, %eax testl %eax, %eax + jnz L(last_vec_x1_return) - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %edx - jle L(zero) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + vpmovmskb %ymm2, %eax testl %eax, %eax - jnz L(first_vec_x1_check) - xorl %eax, %eax - VZEROUPPER - ret + jnz L(last_vec_x2_return) - .p2align 4 -L(first_vec_x0_check): - tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) + vpmovmskb %ymm3, %eax + /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 2 - 1), %rdi +# else + subq $-(VEC_SIZE * 2 + 1), %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# ifndef USE_AS_RAWMEMCHR .p2align 4 L(first_vec_x1_check): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $VEC_SIZE, %rax + /* Adjust length. */ + subl $-(VEC_SIZE * 4), %edx + /* Check if match within remaining length. */ + cmpl %eax, %edx + jbe L(set_zero_end) + incq %rdi addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN + .p2align 4 +L(set_zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x2_check): +L(last_vec_x1_return): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $(VEC_SIZE * 2), %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 4 - 1), %rdi +# else + incq %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x2_return): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $(VEC_SIZE * 3), %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 3 - 1), %rdi +# else + subq $-(VEC_SIZE + 1), %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# ifndef USE_AS_RAWMEMCHR .p2align 4 -L(zero): - VZEROUPPER -L(null): - xorl %eax, %eax - ret -# endif +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + /* If remaining length <= CHAR_PER_VEC * 2. */ + addl $(VEC_SIZE * 2), %edx + jle L(last_2x_vec) .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax - addq %rdi, %rax - VZEROUPPER - ret +L(last_4x_vec): + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2_return) - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %eax - addq $VEC_SIZE, %rax - addq %rdi, %rax - VZEROUPPER - ret + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax - .p2align 4 -L(first_vec_x2): + /* Create mask for possible matches within remaining length. */ + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx + + /* Test matches in data against length match. */ + andl %ecx, %eax + jnz L(last_vec_x3) + + /* if remaining length <= VEC_SIZE * 3 (Note this is after + remaining length was found to be > VEC_SIZE * 2. */ + subl $VEC_SIZE, %edx + jbe L(zero_end2) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Shift remaining length mask for last VEC. */ + shrq $32, %rcx + andl %ecx, %eax + jz L(zero_end2) tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + addq $(VEC_SIZE * 3 + 1), %rdi addq %rdi, %rax - VZEROUPPER - ret +L(zero_end2): + VZEROUPPER_RETURN .p2align 4 -L(4x_vec_end): - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(first_vec_x1) - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(first_vec_x2) - vpmovmskb %ymm4, %eax - testl %eax, %eax -L(first_vec_x3): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax + subq $-(VEC_SIZE * 2 + 1), %rdi addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# endif END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S new file mode 100644 index 0000000000..f3fdad4fda --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -0,0 +1,478 @@ +/* memchr/wmemchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifndef MEMCHR +# define MEMCHR __memchr_evex +# endif + +# ifdef USE_AS_WMEMCHR +# define VPBROADCAST vpbroadcastd +# define VPMINU vpminud +# define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 +# else +# define VPBROADCAST vpbroadcastb +# define VPMINU vpminub +# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 +# endif + +# ifdef USE_AS_RAWMEMCHR +# define RAW_PTR_REG rcx +# define ALGN_PTR_REG rdi +# else +# define RAW_PTR_REG rdi +# define ALGN_PTR_REG rcx +# endif + +# define XMMZERO xmm23 +# define YMMZERO ymm23 +# define XMMMATCH xmm16 +# define YMMMATCH ymm16 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits +ENTRY (MEMCHR) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + /* Check if we may cross page boundary with one vector load. */ + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ + VPCMP $0, (%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax +# ifndef USE_AS_RAWMEMCHR + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret + + .p2align 5 +L(first_vec_x0): + /* Check if first match was before length. */ + tzcntl %eax, %eax + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax, CHAR_SIZE), %rax + cmovle %rcx, %rax + ret +# else + /* NB: first_vec_x0 is 17 bytes which will leave + cross_page_boundary (which is relatively cold) close enough + to ideal alignment. So only realign L(cross_page_boundary) if + rawmemchr. */ + .p2align 4 +# endif +L(cross_page_boundary): + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or + adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi + for rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 + kmovd %k0, %r8d +# ifdef USE_AS_WMEMCHR + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ + sarl $2, %eax +# endif +# ifndef USE_AS_RAWMEMCHR + movl $(PAGE_SIZE / CHAR_SIZE), %esi + subl %eax, %esi +# endif +# ifdef USE_AS_WMEMCHR + andl $(CHAR_PER_VEC - 1), %eax +# endif + /* Remove the leading bytes. */ + sarxl %eax, %r8d, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax +# else + addq %RAW_PTR_REG, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5 +L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + +# ifndef USE_AS_RAWMEMCHR + /* Align data to VEC_SIZE. */ +L(cross_page_continue): + xorl %ecx, %ecx + subl %edi, %ecx + andq $-VEC_SIZE, %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %esi +# endif +# else + andq $-VEC_SIZE, %rdi +L(cross_page_continue): +# endif + /* Load first VEC regardless. */ + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. If near end handle specially. */ + subq %rsi, %rdx + jbe L(last_4x_vec_or_less) +# endif + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x4) + + +# ifndef USE_AS_RAWMEMCHR + /* Check if at last CHAR_PER_VEC * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + addq $VEC_SIZE, %rdi + + /* Align data to VEC_SIZE * 4 for the loop and readjust length. + */ +# ifdef USE_AS_WMEMCHR + movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx + addq %rcx, %rdx +# else + addq %rdi, %rdx + andq $-(4 * VEC_SIZE), %rdi + subq %rdi, %rdx +# endif +# else + addq $VEC_SIZE, %rdi + andq $-(4 * VEC_SIZE), %rdi +# endif + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Compare 4 * VEC at a time forward. */ + .p2align 4 +L(loop_4x_vec): + /* It would be possible to save some instructions using 4x VPCMP + but bottleneck on port 5 makes it not woth it. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 + /* xor will set bytes match esi to zero. */ + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ + VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 +# ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi + kortestd %k2, %k3 + jz L(loop_4x_vec) +# else + kortestd %k2, %k3 + jnz L(loop_4x_vec_end) + + subq $-(VEC_SIZE * 4), %rdi + + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + + /* Fall through into less than 4 remaining vectors of length case. + */ + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + addq $(VEC_SIZE * 3), %rdi + .p2align 4 +L(last_4x_vec_or_less): + /* Check if first VEC contained match. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + + /* If remaining length > CHAR_PER_VEC * 2. */ + addl $(CHAR_PER_VEC * 2), %edx + jg L(last_4x_vec) + +L(last_2x_vec): + /* If remaining length < CHAR_PER_VEC. */ + addl $CHAR_PER_VEC, %edx + jle L(zero_end) + + /* Check VEC2 and compare any match with remaining length. */ + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax + cmpl %eax, %edx + jbe L(set_zero_end) + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +L(zero_end): + ret + + + .p2align 4 +L(first_vec_x1_check): + tzcntl %eax, %eax + /* Adjust length. */ + subl $-(CHAR_PER_VEC * 4), %edx + /* Check if match within remaining length. */ + cmpl %eax, %edx + jbe L(set_zero_end) + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + ret +L(set_zero_end): + xorl %eax, %eax + ret + + .p2align 4 +L(loop_4x_vec_end): +# endif + /* rawmemchr will fall through into this if match was found in + loop. */ + + /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +# ifdef USE_AS_WMEMCHR + subl $((1 << CHAR_PER_VEC) - 1), %eax +# else + incl %eax +# endif + jnz L(last_vec_x1_return) + + VPCMP $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2_return) + + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x3_return) + + kmovd %k3, %eax + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +# else + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x1_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x2_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x3_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + +# ifndef USE_AS_RAWMEMCHR +L(last_4x_vec_or_less_cmpeq): + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + + /* If remaining length <= CHAR_PER_VEC * 2. */ + addl $(CHAR_PER_VEC * 2), %edx + jle L(last_2x_vec) + + .p2align 4 +L(last_4x_vec): + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + /* Create mask for possible matches within remaining length. */ +# ifdef USE_AS_WMEMCHR + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx + bzhil %edx, %ecx, %ecx +# else + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx +# endif + /* Test matches in data against length match. */ + andl %ecx, %eax + jnz L(last_vec_x3) + + /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after + remaining length was found to be > CHAR_PER_VEC * 2. */ + subl $CHAR_PER_VEC, %edx + jbe L(zero_end2) + + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + /* Shift remaining length mask for last VEC. */ +# ifdef USE_AS_WMEMCHR + shrl $CHAR_PER_VEC, %ecx +# else + shrq $CHAR_PER_VEC, %rcx +# endif + andl %ecx, %eax + jz L(zero_end2) + tzcntl %eax, %eax + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax +L(zero_end2): + ret + +L(last_vec_x2): + tzcntl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(last_vec_x3): + tzcntl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret +# endif + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000000..cf4eff5d4a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index 67fc575b59..87f9478eaf 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -47,6 +47,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define VEC_MASK ((1 << VEC_SIZE) - 1) @@ -55,7 +59,7 @@ memcmp has to use UNSIGNED comparison for elemnts. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP shl $2, %RDX_LP @@ -123,8 +127,8 @@ ENTRY (MEMCMP) vptest %ymm0, %ymm5 jnc L(4x_vec_end) xorl %eax, %eax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_2x_vec): @@ -144,8 +148,7 @@ L(last_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec): @@ -164,8 +167,7 @@ L(wmemcmp_return): movzbl (%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WMEMCMP .p2align 4 @@ -367,8 +369,7 @@ L(last_4x_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -394,8 +395,7 @@ L(4x_vec_end): movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -410,8 +410,7 @@ L(first_vec_x1): movzbl VEC_SIZE(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -426,7 +425,6 @@ L(first_vec_x2): movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCMP) #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S new file mode 100644 index 0000000000..9c093972e1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S @@ -0,0 +1,440 @@ +/* memcmp/wmemcmp optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +/* memcmp/wmemcmp is implemented as: + 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap + to avoid branches. + 2. Use overlapping compare to avoid branch. + 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 + bytes for wmemcmp. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include + +# ifndef MEMCMP +# define MEMCMP __memcmp_evex_movbe +# endif + +# define VMOVU vmovdqu64 + +# ifdef USE_AS_WMEMCMP +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# define XMM1 xmm17 +# define XMM2 xmm18 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# ifdef USE_AS_WMEMCMP +# define VEC_MASK 0xff +# define XMM_MASK 0xf +# else +# define VEC_MASK 0xffffffff +# define XMM_MASK 0xffff +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.evex,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k1 + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_vec) + + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + + kandd %k1, %k2, %k5 + kandd %k3, %k4, %k6 + kandd %k5, %k6, %k6 + + kmovd %k6, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k1, %k2, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + xorl %eax, %eax + ret + + .p2align 4 +L(last_2x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + +L(last_vec): + /* Use overlapping loads to avoid branches. */ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx, 4), %edx + cmpl (%rsi, %rcx, 4), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + shll $8, %eax + shll $8, %ecx + bswap %eax + bswap %ecx + movb -1(%rdi, %rdx), %al + movb -1(%rsi, %rdx), %cl + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k2, %k1, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(4x_vec_end): + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x1) + kmovd %k3, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x2) + kmovd %k4, %eax + subl $VEC_MASK, %eax + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rcx, 4), %edx + cmpl VEC_SIZE(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S new file mode 100644 index 0000000000..1ec1962e86 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S @@ -0,0 +1,17 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +# define VZEROUPPER_RETURN jmp L(return) + +# define SECTION(p) p##.avx.rtm +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S index aac1515cf6..848848ab39 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -1,11 +1,32 @@ #if IS_IN (libc) # define VEC_SIZE 64 -# define VEC(i) zmm##i +# define XMM0 xmm16 +# define XMM1 xmm17 +# define YMM0 ymm16 +# define YMM1 ymm17 +# define VEC0 zmm16 +# define VEC1 zmm17 +# define VEC2 zmm18 +# define VEC3 zmm19 +# define VEC4 zmm20 +# define VEC5 zmm21 +# define VEC6 zmm22 +# define VEC7 zmm23 +# define VEC8 zmm24 +# define VEC9 zmm25 +# define VEC10 zmm26 +# define VEC11 zmm27 +# define VEC12 zmm28 +# define VEC13 zmm29 +# define VEC14 zmm30 +# define VEC15 zmm31 +# define VEC(i) VEC##i # define VMOVNT vmovntdq # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 +# define VZEROUPPER -# define SECTION(p) p##.avx512 +# define SECTION(p) p##.evex512 # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s # include "memmove-vec-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S new file mode 100644 index 0000000000..0cbce8f944 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S @@ -0,0 +1,33 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define XMM0 xmm16 +# define XMM1 xmm17 +# define YMM0 ymm16 +# define YMM1 ymm17 +# define VEC0 ymm16 +# define VEC1 ymm17 +# define VEC2 ymm18 +# define VEC3 ymm19 +# define VEC4 ymm20 +# define VEC5 ymm21 +# define VEC6 ymm22 +# define VEC7 ymm23 +# define VEC8 ymm24 +# define VEC9 ymm25 +# define VEC10 ymm26 +# define VEC11 ymm27 +# define VEC12 ymm28 +# define VEC13 ymm29 +# define VEC14 ymm30 +# define VEC15 ymm31 +# define VEC(i) VEC##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 +# define VZEROUPPER + +# define SECTION(p) p##.evex +# define MEMMOVE_SYMBOL(p,s) p##_evex_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index bd5dc1a3f3..f71c343ecb 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -48,6 +48,14 @@ # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) #endif +#ifndef XMM0 +# define XMM0 xmm0 +#endif + +#ifndef YMM0 +# define YMM0 ymm0 +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper @@ -56,6 +64,13 @@ # endif #endif +/* Avoid short distance rep movsb only with non-SSE vector. */ +#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB +# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) +#else +# define AVOID_SHORT_DISTANCE_REP_MOVSB 0 +#endif + #ifndef PREFETCH # define PREFETCH(addr) prefetcht0 addr #endif @@ -132,11 +147,12 @@ L(last_2x_vec): VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER #if !defined USE_MULTIARCH || !IS_IN (libc) L(nop): -#endif ret +#else + VZEROUPPER_RETURN +#endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) @@ -229,8 +245,11 @@ L(last_2x_vec): VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(movsb): cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP @@ -243,7 +262,21 @@ L(movsb): cmpq %r9, %rdi /* Avoid slow backward REP MOVSB. */ jb L(more_8x_vec_backward) +# if AVOID_SHORT_DISTANCE_REP_MOVSB + movq %rdi, %rcx + subq %rsi, %rcx + jmp 2f +# endif 1: +# if AVOID_SHORT_DISTANCE_REP_MOVSB + movq %rsi, %rcx + subq %rdi, %rcx +2: +/* Avoid "rep movsb" if RCX, the distance between source and destination, + is N*4GB + [1..63] with N >= 0. */ + cmpl $63, %ecx + jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ +# endif mov %RDX_LP, %RCX_LP rep movsb L(nop): @@ -277,21 +310,20 @@ L(less_vec): #if VEC_SIZE > 32 L(between_32_63): /* From 32 to 63. No branch when size == 32. */ - vmovdqu (%rsi), %ymm0 - vmovdqu -32(%rsi,%rdx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -32(%rdi,%rdx) - VZEROUPPER - ret + VMOVU (%rsi), %YMM0 + VMOVU -32(%rsi,%rdx), %YMM1 + VMOVU %YMM0, (%rdi) + VMOVU %YMM1, -32(%rdi,%rdx) + VZEROUPPER_RETURN #endif #if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu (%rsi), %xmm0 - vmovdqu -16(%rsi,%rdx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -16(%rdi,%rdx) - ret + VMOVU (%rsi), %XMM0 + VMOVU -16(%rsi,%rdx), %XMM1 + VMOVU %XMM0, (%rdi) + VMOVU %XMM1, -16(%rdi,%rdx) + VZEROUPPER_RETURN #endif L(between_8_15): /* From 8 to 15. No branch when size == 8. */ @@ -344,8 +376,7 @@ L(more_2x_vec): VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(last_4x_vec): /* Copy from 2 * VEC to 4 * VEC. */ VMOVU (%rsi), %VEC(0) @@ -356,8 +387,7 @@ L(last_4x_vec): VMOVU %VEC(1), VEC_SIZE(%rdi) VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec): cmpq %rsi, %rdi @@ -413,8 +443,7 @@ L(loop_4x_vec_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec_backward): /* Load the first 4 * VEC and last VEC to support overlapping @@ -465,8 +494,7 @@ L(loop_4x_vec_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_forward): @@ -501,8 +529,7 @@ L(loop_large_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(large_backward): /* Don't use non-temporal store if there is overlap between @@ -536,8 +563,7 @@ L(loop_large_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S new file mode 100644 index 0000000000..cea2d2a72d --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index f5437b54de..c8d54c08d6 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -20,14 +20,22 @@ # include +# ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2 +# endif + # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits -ENTRY (__memrchr_avx2) + .section SECTION(.text),"ax",@progbits +ENTRY (MEMRCHR) /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 @@ -134,8 +142,8 @@ L(loop_4x_vec): vpmovmskb %ymm1, %eax bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_4x_vec_or_less): @@ -169,8 +177,7 @@ L(last_4x_vec_or_less): addq %rax, %rdx jl L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -191,31 +198,27 @@ L(last_2x_vec): jl L(zero) addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x0): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x1): bsrl %eax, %eax addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x2): bsrl %eax, %eax addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3): @@ -232,8 +235,7 @@ L(last_vec_x1_check): jl L(zero) addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3_check): @@ -243,12 +245,14 @@ L(last_vec_x3_check): jl L(zero) addl $(VEC_SIZE * 3), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + VZEROUPPER_RETURN + + .p2align 4 L(null): xorl %eax, %eax ret @@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_or_less): @@ -315,8 +318,7 @@ L(last_vec_or_less): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_2x_aligned): @@ -353,7 +355,6 @@ L(last_vec_2x_aligned): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret -END (__memrchr_avx2) + VZEROUPPER_RETURN +END (MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S new file mode 100644 index 0000000000..16bf8e02b1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -0,0 +1,337 @@ +/* memrchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# define VMOVA vmovdqa64 + +# define YMMMATCH ymm16 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (__memrchr_evex) + /* Broadcast CHAR to YMMMATCH. */ + vpbroadcastb %esi, %YMMMATCH + + sub $VEC_SIZE, %RDX_LP + jbe L(last_vec_or_less) + + add %RDX_LP, %RDI_LP + + /* Check the last VEC_SIZE bytes. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + subq $(VEC_SIZE * 4), %rdi + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(aligned_more) + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rdx + andq $-VEC_SIZE, %rdi + subq %rcx, %rdx + + .p2align 4 +L(aligned_more): + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + vpcmpb $0, (%rdi), %YMMMATCH, %k4 + kmovd %k4, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + /* Align data to 4 * VEC_SIZE for loop with fewer branches. + There are some overlaps with above if data isn't aligned + to 4 * VEC_SIZE. */ + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx + jz L(loop_4x_vec) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rdx + andq $-(VEC_SIZE * 4), %rdi + subq %rcx, %rdx + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + subq $(VEC_SIZE * 4), %rdi + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 + kord %k1, %k2, %k5 + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 + + kord %k3, %k4, %k6 + kortestd %k5, %k6 + jz L(loop_4x_vec) + + /* There is a match. */ + kmovd %k4, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + kmovd %k1, %eax + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_4x_vec_or_less): + addl $(VEC_SIZE * 4), %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x1_check) + cmpl $(VEC_SIZE * 3), %edx + jbe L(zero) + + vpcmpb $0, (%rdi), %YMMMATCH, %k4 + kmovd %k4, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax + ret + + .p2align 4 +L(last_2x_vec): + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3_check) + cmpl $VEC_SIZE, %edx + jbe L(zero) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 2), %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x3): + bsrl %eax, %eax + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1_check): + bsrl %eax, %eax + subq $(VEC_SIZE * 3), %rdx + addq %rax, %rdx + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x3_check): + bsrl %eax, %eax + subq $VEC_SIZE, %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(last_vec_or_less_aligned): + movl %edx, %ecx + + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + + movl $1, %edx + /* Support rdx << 32. */ + salq %cl, %rdx + subq $1, %rdx + + kmovd %k1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_or_less): + addl $VEC_SIZE, %edx + + /* Check for zero length. */ + testl %edx, %edx + jz L(zero) + + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(last_vec_or_less_aligned) + + movl %ecx, %esi + movl %ecx, %r8d + addl %edx, %esi + andq $-VEC_SIZE, %rdi + + subl $VEC_SIZE, %esi + ja L(last_vec_2x_aligned) + + /* Check the last VEC. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + + /* Remove the leading and trailing bytes. */ + sarl %cl, %eax + movl %edx, %ecx + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + ret + + .p2align 4 +L(last_vec_2x_aligned): + movl %esi, %ecx + + /* Check the last VEC. */ + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + kmovd %k1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + + testl %eax, %eax + jnz L(last_vec_x1) + + /* Check the second last VEC. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + + movl %r8d, %ecx + + kmovd %k1, %eax + + /* Remove the leading bytes. Must use unsigned right shift for + bsrl below. */ + shrl %cl, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + ret +END (__memrchr_evex) +#endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S new file mode 100644 index 0000000000..8ac3e479bb --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -0,0 +1,10 @@ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return) + +#define SECTION(p) p##.avx.rtm +#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + +#include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 7ab3d89849..ae0860f36a 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -14,9 +14,15 @@ movq r, %rax; \ vpbroadcastd %xmm0, %ymm0 -# define SECTION(p) p##.avx -# define MEMSET_SYMBOL(p,s) p##_avx2_##s -# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# ifndef SECTION +# define SECTION(p) p##.avx +# endif +# ifndef MEMSET_SYMBOL +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# endif +# ifndef WMEMSET_SYMBOL +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# endif # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index 0783979ca5..22e7b187c8 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -1,22 +1,22 @@ #if IS_IN (libc) # define VEC_SIZE 64 -# define VEC(i) zmm##i +# define XMM0 xmm16 +# define YMM0 ymm16 +# define VEC0 zmm16 +# define VEC(i) VEC##i # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 +# define VZEROUPPER # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ movq r, %rax; \ - vpbroadcastb %xmm0, %xmm0; \ - vpbroadcastq %xmm0, %zmm0 + vpbroadcastb d, %VEC0 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ movq r, %rax; \ - vpbroadcastd %xmm0, %xmm0; \ - vpbroadcastq %xmm0, %zmm0 + vpbroadcastd d, %VEC0 -# define SECTION(p) p##.avx512 +# define SECTION(p) p##.evex512 # define MEMSET_SYMBOL(p,s) p##_avx512_##s # define WMEMSET_SYMBOL(p,s) p##_avx512_##s diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S new file mode 100644 index 0000000000..ae0a4d6e46 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S @@ -0,0 +1,24 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define XMM0 xmm16 +# define YMM0 ymm16 +# define VEC0 ymm16 +# define VEC(i) VEC##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 +# define VZEROUPPER + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movq r, %rax; \ + vpbroadcastb d, %VEC0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movq r, %rax; \ + vpbroadcastd d, %VEC0 + +# define SECTION(p) p##.evex +# define MEMSET_SYMBOL(p,s) p##_evex_##s +# define WMEMSET_SYMBOL(p,s) p##_evex_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 2bfc95de05..de5a8a38f5 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -34,20 +34,25 @@ # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) #endif +#ifndef XMM0 +# define XMM0 xmm0 +#endif + +#ifndef YMM0 +# define YMM0 ymm0 +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper +# define VZEROUPPER_SHORT_RETURN vzeroupper; ret # else # define VZEROUPPER # endif #endif #ifndef VZEROUPPER_SHORT_RETURN -# if VEC_SIZE > 16 -# define VZEROUPPER_SHORT_RETURN vzeroupper -# else -# define VZEROUPPER_SHORT_RETURN rep -# endif +# define VZEROUPPER_SHORT_RETURN rep; ret #endif #ifndef MOVQ @@ -67,7 +72,7 @@ ENTRY (__bzero) mov %RDI_LP, %RAX_LP /* Set return value. */ mov %RSI_LP, %RDX_LP /* Set n. */ - pxor %xmm0, %xmm0 + pxor %XMM0, %XMM0 jmp L(entry_from_bzero) END (__bzero) weak_alias (__bzero, bzero) @@ -109,8 +114,7 @@ L(entry_from_bzero): /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -133,14 +137,12 @@ ENTRY (__memset_erms) ENTRY (MEMSET_SYMBOL (__memset, erms)) # endif L(stosb): - /* Issue vzeroupper before rep stosb. */ - VZEROUPPER mov %RDX_LP, %RCX_LP movzbl %sil, %eax mov %RDI_LP, %RDX_LP rep stosb mov %RDX_LP, %RAX_LP - ret + VZEROUPPER_RETURN # if VEC_SIZE == 16 END (__memset_erms) # else @@ -167,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(stosb_more_2x_vec): cmp __x86_rep_stosb_threshold(%rip), %RDX_LP @@ -182,8 +183,11 @@ L(more_2x_vec): VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(loop_start): leaq (VEC_SIZE * 4)(%rdi), %rcx @@ -209,7 +213,6 @@ L(loop): cmpq %rcx, %rdx jne L(loop) VZEROUPPER_SHORT_RETURN - ret L(less_vec): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 @@ -223,7 +226,7 @@ L(less_vec): cmpb $16, %dl jae L(between_16_31) # endif - MOVQ %xmm0, %rcx + MOVQ %XMM0, %rcx cmpb $8, %dl jae L(between_8_15) cmpb $4, %dl @@ -233,40 +236,34 @@ L(less_vec): jb 1f movb %cl, (%rdi) 1: - VZEROUPPER - ret + VZEROUPPER_RETURN # if VEC_SIZE > 32 /* From 32 to 63. No branch when size == 32. */ L(between_32_63): - vmovdqu %ymm0, -32(%rdi,%rdx) - vmovdqu %ymm0, (%rdi) - VZEROUPPER - ret + VMOVU %YMM0, -32(%rdi,%rdx) + VMOVU %YMM0, (%rdi) + VZEROUPPER_RETURN # endif # if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu %xmm0, -16(%rdi,%rdx) - vmovdqu %xmm0, (%rdi) - VZEROUPPER - ret + VMOVU %XMM0, -16(%rdi,%rdx) + VMOVU %XMM0, (%rdi) + VZEROUPPER_RETURN # endif /* From 8 to 15. No branch when size == 8. */ L(between_8_15): movq %rcx, -8(%rdi,%rdx) movq %rcx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_4_7): /* From 4 to 7. No branch when size == 4. */ movl %ecx, -4(%rdi,%rdx) movl %ecx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_2_3): /* From 2 to 3. No branch when size == 2. */ movw %cx, -2(%rdi,%rdx) movw %cx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S new file mode 100644 index 0000000000..acc5f6e2fb --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_avx2_rtm +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S new file mode 100644 index 0000000000..ec942b77ba --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_evex +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S new file mode 100644 index 0000000000..2b9c07a59f --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S new file mode 100644 index 0000000000..7c6f26cd98 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S new file mode 100644 index 0000000000..60a2ccfe53 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S new file mode 100644 index 0000000000..1570014d1c --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S new file mode 100644 index 0000000000..637fb557c4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCAT +# define STRCAT __strcat_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S index a4143bf8f5..1e6d4827ee 100644 --- a/sysdeps/x86_64/multiarch/strcat-avx2.S +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S @@ -30,7 +30,11 @@ /* Number of bytes in a vector register */ # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + + .section SECTION(.text),"ax",@progbits ENTRY (STRCAT) mov %rdi, %r9 # ifdef USE_AS_STRNCAT diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S new file mode 100644 index 0000000000..97c3d85b6d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-evex.S @@ -0,0 +1,283 @@ +/* strcat with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifndef STRCAT +# define STRCAT __strcat_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 + +# define USE_AS_STRCAT + +/* Number of bytes in a vector register */ +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + xor %eax, %eax + mov %edi, %ecx + and $((VEC_SIZE * 4) - 1), %ecx + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + cmp $(VEC_SIZE * 3), %ecx + ja L(fourth_vector_boundary) + vpcmpb $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_first_vector) + mov %rdi, %rax + and $-VEC_SIZE, %rax + jmp L(align_vec_size_start) +L(fourth_vector_boundary): + mov %rdi, %rax + and $-VEC_SIZE, %rax + vpcmpb $0, (%rax), %YMMZERO, %k0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + kmovd %k0, %edx + and %r10d, %edx + jnz L(exit) + +L(align_vec_size_start): + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + kmovd %k4, %edx + add $(VEC_SIZE * 4), %rax + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 5), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 + add $VEC_SIZE, %rax + kmovd %k1, %edx + test %edx, %edx + jnz L(exit) + + add $VEC_SIZE, %rax + + .p2align 4 +L(align_four_vec_loop): + VMOVA (%rax), %YMM0 + VMOVA (VEC_SIZE * 2)(%rax), %YMM1 + vpminub VEC_SIZE(%rax), %YMM0, %YMM0 + vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 + vpminub %YMM0, %YMM1, %YMM0 + /* If K0 != 0, there is a null byte. */ + vpcmpb $0, %YMM0, %YMMZERO, %k0 + add $(VEC_SIZE * 4), %rax + ktestd %k0, %k0 + jz L(align_four_vec_loop) + + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 + sub $(VEC_SIZE * 5), %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_null_on_first_vector): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_second_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $VEC_SIZE, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_third_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 2), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fourth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 3), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fifth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-evex.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S new file mode 100644 index 0000000000..81f20d1d8e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCHR +# define STRCHR __strchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 39fc69da7b..0a5217514a 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -38,9 +38,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) movl %edi, %ecx /* Broadcast CHAR to YMM0. */ @@ -93,8 +97,8 @@ L(cros_page_boundary): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -190,8 +194,7 @@ L(first_vec_x0): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -205,8 +208,7 @@ L(first_vec_x1): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -220,8 +222,7 @@ L(first_vec_x2): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -247,8 +248,7 @@ L(first_vec_x3): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S new file mode 100644 index 0000000000..ddc86a7058 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -0,0 +1,335 @@ +/* strchr/strchrnul optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifndef STRCHR +# define STRCHR __strchr_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSCHR +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# define VPMINU vpminud +# define CHAR_REG esi +# define SHIFT_REG r8d +# else +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# define VPMINU vpminub +# define CHAR_REG sil +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 + +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 +# define YMM2 ymm19 +# define YMM3 ymm20 +# define YMM4 ymm21 +# define YMM5 ymm22 +# define YMM6 ymm23 +# define YMM7 ymm24 +# define YMM8 ymm25 + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits +ENTRY (STRCHR) + movl %edi, %ecx +# ifndef USE_AS_STRCHRNUL + xorl %edx, %edx +# endif + + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we cross page boundary with one vector load. */ + andl $(PAGE_SIZE - 1), %ecx + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null bytes. */ + VMOVU (%rdi), %YMM1 + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + ktestd %k0, %k0 + jz L(more_vecs) + kmovd %k0, %eax + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(more_vecs): + /* Align data for aligned loads in the loop. */ + andq $-VEC_SIZE, %rdi +L(aligned_more): + + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + VMOVA VEC_SIZE(%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VMOVA VEC_SIZE(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + ktestd %k0, %k0 + jz L(prep_loop_4x) + + kmovd %k0, %eax + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x0): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq VEC_SIZE(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + +L(prep_loop_4x): + /* Align data to 4 * VEC_SIZE. */ + andq $-(VEC_SIZE * 4), %rdi + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM5 + vpxorq %YMM2, %YMM0, %YMM6 + vpxorq %YMM3, %YMM0, %YMM7 + vpxorq %YMM4, %YMM0, %YMM8 + + VPMINU %YMM5, %YMM1, %YMM5 + VPMINU %YMM6, %YMM2, %YMM6 + VPMINU %YMM7, %YMM3, %YMM7 + VPMINU %YMM8, %YMM4, %YMM8 + + VPMINU %YMM5, %YMM6, %YMM1 + VPMINU %YMM7, %YMM8, %YMM2 + + VPMINU %YMM1, %YMM2, %YMM1 + + /* Each bit in K0 represents a CHAR or a null byte. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + + addq $(VEC_SIZE * 4), %rdi + + ktestd %k0, %k0 + jz L(loop_4x_vec) + + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM5, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ + VPCMP $0, %YMMZERO, %YMM6, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ + VPCMP $0, %YMMZERO, %YMM7, %k2 + /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ + VPCMP $0, %YMMZERO, %YMM8, %k3 + +# ifdef USE_AS_WCSCHR + /* NB: Each bit in K2/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k1 +# else + kshiftlq $32, %k3, %k1 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rax + + tzcntq %rax, %rax +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + /* Cold case for crossing page with first load. */ + .p2align 4 +L(cross_page_boundary): + andq $-VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + + VMOVA (%rdi), %YMM1 + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + +# ifdef USE_AS_WCSCHR + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + + /* Remove the leading bits. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax + + jz L(aligned_more) + tzcntl %eax, %eax + addq %rcx, %rdi +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + +END (STRCHR) +# endif diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c index 8df4609bf8..4ed1177c70 100644 --- a/sysdeps/x86_64/multiarch/strchr.c +++ b/sysdeps/x86_64/multiarch/strchr.c @@ -29,16 +29,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) return OPTIMIZE (sse2_no_bsf); diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S new file mode 100644 index 0000000000..cdcf818b91 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_avx2_rtm +#define USE_AS_STRCHRNUL 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S new file mode 100644 index 0000000000..064fe7ca9e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_evex +#define USE_AS_STRCHRNUL 1 +#include "strchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S new file mode 100644 index 0000000000..aecd30d97f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCMP +# define STRCMP __strcmp_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index d42b04b54f..759e5b64c2 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -55,6 +55,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. strcmp/strncmp have to use UNSIGNED comparison for elements. @@ -75,7 +79,7 @@ the maximum offset is reached before a difference is found, zero is returned. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCMP) # ifdef USE_AS_STRNCMP /* Check for simple cases (0 or 1) in offset. */ @@ -83,6 +87,16 @@ ENTRY (STRCMP) je L(char0) jb L(zero) # ifdef USE_AS_WCSCMP +# ifndef __ILP32__ + movq %rdx, %rcx + /* Check if length could overflow when multiplied by + sizeof(wchar_t). Checking top 8 bits will cover all potential + overflow cases as well as redirect cases where its impossible to + length to bound a valid memory region. In these cases just use + 'wcscmp'. */ + shrq $56, %rcx + jnz __wcscmp_avx2 +# endif /* Convert units: from wide to byte char. */ shl $2, %RDX_LP # endif @@ -127,8 +141,8 @@ L(return): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(return_vec_size): @@ -161,8 +175,7 @@ L(return_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_2_vec_size): @@ -195,8 +208,7 @@ L(return_2_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_3_vec_size): @@ -229,8 +241,7 @@ L(return_3_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(next_3_vectors): @@ -356,8 +367,7 @@ L(back_to_loop): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_vec): @@ -400,8 +410,7 @@ L(test_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_2_vec): @@ -444,8 +453,7 @@ L(test_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_3_vec): @@ -486,8 +494,7 @@ L(test_3_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page): @@ -556,8 +563,7 @@ L(loop_cross_page): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page_2_vec): @@ -631,8 +637,7 @@ L(loop_cross_page_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCMP L(string_nbyte_offset_check): @@ -674,8 +679,7 @@ L(cross_page_loop): # ifndef USE_AS_WCSCMP L(different): # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WCSCMP .p2align 4 @@ -685,16 +689,14 @@ L(different): setl %al negl %eax orl $1, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # ifdef USE_AS_STRNCMP .p2align 4 L(zero): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char0): @@ -708,8 +710,7 @@ L(char0): movzbl (%rdi), %eax subl %ecx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif .p2align 4 @@ -734,8 +735,7 @@ L(last_vector): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Comparing on page boundary region requires special treatment: It must done one vector at the time, starting with the wider @@ -856,7 +856,6 @@ L(cross_page_4bytes): testl %eax, %eax jne L(cross_page_loop) subl %ecx, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCMP) #endif diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S new file mode 100644 index 0000000000..459eeed09f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S @@ -0,0 +1,1043 @@ +/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifndef STRCMP +# define STRCMP __strcmp_evex +# endif + +# define PAGE_SIZE 4096 + +/* VEC_SIZE = Number of bytes in a ymm register */ +# define VEC_SIZE 32 + +/* Shift for dividing by (VEC_SIZE * 4). */ +# define DIVIDE_BY_VEC_4_SHIFT 7 +# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSCMP +/* Compare packed dwords. */ +# define VPCMP vpcmpd +# define SHIFT_REG32 r8d +# define SHIFT_REG64 r8 +/* 1 dword char == 4 bytes. */ +# define SIZE_OF_CHAR 4 +# else +/* Compare packed bytes. */ +# define VPCMP vpcmpb +# define SHIFT_REG32 ecx +# define SHIFT_REG64 rcx +/* 1 byte char == 1 byte. */ +# define SIZE_OF_CHAR 1 +# endif + +# define XMMZERO xmm16 +# define XMM0 xmm17 +# define XMM1 xmm18 + +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 +# define YMM2 ymm19 +# define YMM3 ymm20 +# define YMM4 ymm21 +# define YMM5 ymm22 +# define YMM6 ymm23 +# define YMM7 ymm24 + +/* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +*/ + +/* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on + USE_AS_WCSCMP. In order to check the null char, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd + instructions. Main loop (away from from page boundary) compares 4 + vectors are a time, effectively comparing 4 x VEC_SIZE bytes (128 + bytes) on each loop. + + The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic + is the same as strcmp, except that an a maximum offset is tracked. If + the maximum offset is reached before a difference is found, zero is + returned. */ + + .section .text.evex,"ax",@progbits +ENTRY (STRCMP) +# ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. */ + cmp $1, %RDX_LP + je L(char0) + jb L(zero) +# ifdef USE_AS_WCSCMP + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP +# endif + /* Register %r11 tracks the maximum offset. */ + mov %RDX_LP, %R11_LP +# endif + movl %edi, %eax + xorl %edx, %edx + /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 + VMOVU (%rsi), %YMM1 + + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + + /* Check for NULL in YMM0. */ + VPCMP $0, %YMMZERO, %YMM0, %k1 + /* Check for NULL in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + + /* Each bit in K1 represents: + 1. A mismatch in YMM0 and YMM1. Or + 2. A NULL in YMM0 or YMM1. + */ + kord %k0, %k1, %k1 + + ktestd %k1, %k1 + je L(next_3_vectors) + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx) is after the maximum + offset (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + je L(return) +L(wcscmp_return): + setl %al + negl %eax + orl $1, %eax +L(return): +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + ret + + .p2align 4 +L(return_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after + the maximum offset (%r11). */ + addq $VEC_SIZE, %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rdx), %ecx + cmpl VEC_SIZE(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rdi, %rdx), %eax + movzbl VEC_SIZE(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(return_2_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 2), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(return_3_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 3), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 + VMOVU VEC_SIZE(%rsi), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 + VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 + VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 + VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 + + /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ + VPCMP $4, %YMM2, %YMM4, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM4, %k2 + /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_2_vec_size) + + /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ + VPCMP $4, %YMM3, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM3, %k1 + VPCMP $0, %YMMZERO, %YMM5, %k2 + /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_3_vec_size) +L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx + movl $PAGE_SIZE, %ecx + /* Align load via RAX. */ + andq $-(VEC_SIZE * 4), %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax +# ifdef USE_AS_STRNCMP + /* Starting from this point, the maximum offset, or simply the + 'offset', DECREASES by the same amount when base pointers are + moved forward. Return 0 when: + 1) On match: offset <= the matched vector index. + 2) On mistmach, offset is before the mistmatched index. + */ + subq %rdx, %r11 + jbe L(zero) +# endif + addq %rsi, %rdx + movq %rdx, %rsi + andl $(PAGE_SIZE - 1), %esi + /* Number of bytes before page crossing. */ + subq %rsi, %rcx + /* Number of VEC_SIZE * 4 blocks before page crossing. */ + shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx + /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ + movl %ecx, %esi + jmp L(loop_start) + + .p2align 4 +L(loop): +# ifdef USE_AS_STRNCMP + /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease + the maximum offset (%r11) by the same amount. */ + subq $(VEC_SIZE * 4), %r11 + jbe L(zero) +# endif + addq $(VEC_SIZE * 4), %rax + addq $(VEC_SIZE * 4), %rdx +L(loop_start): + testl %esi, %esi + leal -1(%esi), %esi + je L(loop_cross_page) +L(back_to_loop): + /* Main loop, comparing 4 vectors are a time. */ + VMOVA (%rax), %YMM0 + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 + VMOVU (%rdx), %YMM1 + VMOVU VEC_SIZE(%rdx), %YMM3 + VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 + VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 + + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + kord %k1, %k2, %k1 + /* Each bit in K4 represents a NULL or a mismatch in YMM0 and + YMM1. */ + kord %k0, %k1, %k4 + + VPCMP $4, %YMM2, %YMM3, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM3, %k2 + kord %k1, %k2, %k1 + /* Each bit in K5 represents a NULL or a mismatch in YMM2 and + YMM3. */ + kord %k0, %k1, %k5 + + VPCMP $4, %YMM4, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM4, %k1 + VPCMP $0, %YMMZERO, %YMM5, %k2 + kord %k1, %k2, %k1 + /* Each bit in K6 represents a NULL or a mismatch in YMM4 and + YMM5. */ + kord %k0, %k1, %k6 + + VPCMP $4, %YMM6, %YMM7, %k0 + VPCMP $0, %YMMZERO, %YMM6, %k1 + VPCMP $0, %YMMZERO, %YMM7, %k2 + kord %k1, %k2, %k1 + /* Each bit in K7 represents a NULL or a mismatch in YMM6 and + YMM7. */ + kord %k0, %k1, %k7 + + kord %k4, %k5, %k0 + kord %k6, %k7, %k1 + + /* Test each mask (32 bits) individually because for VEC_SIZE + == 32 is not possible to OR the four masks and keep all bits + in a 64-bit integer register, differing from SSE2 strcmp + where ORing is possible. */ + kortestd %k0, %k1 + je L(loop) + ktestd %k4, %k4 + je L(test_vec) + kmovd %k4, %edi + tzcntl %edi, %ecx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_vec): +# ifdef USE_AS_STRNCMP + /* The first vector matched. Return 0 if the maximum offset + (%r11) <= VEC_SIZE. */ + cmpq $VEC_SIZE, %r11 + jbe L(zero) +# endif + ktestd %k5, %k5 + je L(test_2_vec) + kmovd %k5, %ecx + tzcntl %ecx, %edi +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edi +# endif +# ifdef USE_AS_STRNCMP + addq $VEC_SIZE, %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl VEC_SIZE(%rsi, %rdi), %ecx + cmpl VEC_SIZE(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rax, %rdi), %eax + movzbl VEC_SIZE(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_2_vec): +# ifdef USE_AS_STRNCMP + /* The first 2 vectors matched. Return 0 if the maximum offset + (%r11) <= 2 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) +# endif + ktestd %k6, %k6 + je L(test_3_vec) + kmovd %k6, %ecx + tzcntl %ecx, %edi +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edi +# endif +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx + cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_3_vec): +# ifdef USE_AS_STRNCMP + /* The first 3 vectors matched. Return 0 if the maximum offset + (%r11) <= 3 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) +# endif + kmovd %k7, %esi + tzcntl %esi, %ecx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 3), %rcx + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %esi + cmpl (%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rsi, %rcx), %esi + cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(loop_cross_page): + xorl %r10d, %r10d + movq %rdx, %rcx + /* Align load via RDX. We load the extra ECX bytes which should + be ignored. */ + andl $((VEC_SIZE * 4) - 1), %ecx + /* R10 is -RCX. */ + subq %rcx, %r10 + + /* This works only if VEC_SIZE * 2 == 64. */ +# if (VEC_SIZE * 2) != 64 +# error (VEC_SIZE * 2) != 64 +# endif + + /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ + cmpl $(VEC_SIZE * 2), %ecx + jge L(loop_cross_page_2_vec) + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 + VMOVU (%rdx, %r10), %YMM4 + VMOVU VEC_SIZE(%rdx, %r10), %YMM5 + + VPCMP $4, %YMM4, %YMM2, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM4, %k2 + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch in YMM2 and + YMM4. */ + kord %k0, %k1, %k1 + + VPCMP $4, %YMM5, %YMM3, %k3 + VPCMP $0, %YMMZERO, %YMM3, %k4 + VPCMP $0, %YMMZERO, %YMM5, %k5 + kord %k4, %k5, %k4 + /* Each bit in K3 represents a NULL or a mismatch in YMM3 and + YMM5. */ + kord %k3, %k4, %k3 + +# ifdef USE_AS_WCSCMP + /* NB: Each bit in K1/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k2 + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 +# else + kshiftlq $32, %k3, %k2 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rdi + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi + testq %rdi, %rdi + je L(loop_cross_page_2_vec) + tzcntq %rdi, %rcx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 + VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 + VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 + + VPCMP $4, %YMM0, %YMM2, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM2, %k2 + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch in YMM0 and + YMM2. */ + kord %k0, %k1, %k1 + + VPCMP $4, %YMM1, %YMM3, %k3 + VPCMP $0, %YMMZERO, %YMM1, %k4 + VPCMP $0, %YMMZERO, %YMM3, %k5 + kord %k4, %k5, %k4 + /* Each bit in K3 represents a NULL or a mismatch in YMM1 and + YMM3. */ + kord %k3, %k4, %k3 + +# ifdef USE_AS_WCSCMP + /* NB: Each bit in K1/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k2 +# else + kshiftlq $32, %k3, %k2 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rdi + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ + subl $(VEC_SIZE * 2), %ecx + jle 1f + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d +# ifdef USE_AS_WCSCMP + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + sarl $2, %ecx +# endif + /* Skip ECX bytes. */ + shrq %cl, %rdi +1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ + movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + + testq %rdi, %rdi +# ifdef USE_AS_STRNCMP + /* At this point, if %rdi value is 0, it already tested + VEC_SIZE*4+%r10 byte starting from %rax. This label + checks whether strncmp maximum offset reached or not. */ + je L(string_nbyte_offset_check) +# else + je L(back_to_loop) +# endif + tzcntq %rdi, %rcx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif + addq %r10, %rcx + /* Adjust for number of bytes skipped. */ + addq %r8, %rcx +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rcx + subq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rcx), %edi + cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + +# ifdef USE_AS_STRNCMP +L(string_nbyte_offset_check): + leaq (VEC_SIZE * 4)(%r10), %r10 + cmpq %r10, %r11 + jbe L(zero) + jmp L(back_to_loop) +# endif + + .p2align 4 +L(cross_page_loop): + /* Check one byte/dword at a time. */ +# ifdef USE_AS_WCSCMP + cmpl %ecx, %eax +# else + subl %ecx, %eax +# endif + jne L(different) + addl $SIZE_OF_CHAR, %edx + cmpl $(VEC_SIZE * 4), %edx + je L(main_loop_header) +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + /* Check null char. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED + comparisons. */ + subl %ecx, %eax +# ifndef USE_AS_WCSCMP +L(different): +# endif + ret + +# ifdef USE_AS_WCSCMP + .p2align 4 +L(different): + /* Use movl to avoid modifying EFLAGS. */ + movl $0, %eax + setl %al + negl %eax + orl $1, %eax + ret +# endif + +# ifdef USE_AS_STRNCMP + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(char0): +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi), %ecx + cmpl (%rsi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + subl %ecx, %eax +# endif + ret +# endif + + .p2align 4 +L(last_vector): + addq %rdx, %rdi + addq %rdx, %rsi +# ifdef USE_AS_STRNCMP + subq %rdx, %r11 +# endif + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + ret + + /* Comparing on page boundary region requires special treatment: + It must done one vector at the time, starting with the wider + ymm vector if possible, if not, with xmm. If fetching 16 bytes + (xmm) still passes the boundary, byte comparison must be done. + */ + .p2align 4 +L(cross_page): + /* Try one ymm vector at a time. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_1_vector) +L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $VEC_SIZE, %edx + + addl $VEC_SIZE, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jle L(loop_1_vector) +L(cross_page_1_vector): + /* Less than 32 bytes to check, try one xmm vector. */ + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 + VMOVU (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + korw %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + korw %k0, %k1, %k1 + kmovw %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $16, %edx +# ifndef USE_AS_WCSCMP + addl $16, %eax +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_1_xmm): +# ifndef USE_AS_WCSCMP + /* Less than 16 bytes to check, try 8 byte vector. NB: No need + for wcscmp nor wcsncmp since wide char is 4 bytes. */ + cmpl $(PAGE_SIZE - 8), %eax + jg L(cross_page_8bytes) + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + +# ifdef USE_AS_WCSCMP + /* Only last 2 bits are valid. */ + andl $0x3, %ecx +# else + /* Only last 8 bits are valid. */ + andl $0xff, %ecx +# endif + + testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx + addl $8, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_8bytes): + /* Less than 8 bytes to check, try 4 byte vector. */ + cmpl $(PAGE_SIZE - 4), %eax + jg L(cross_page_4bytes) + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + +# ifdef USE_AS_WCSCMP + /* Only the last bit is valid. */ + andl $0x1, %ecx +# else + /* Only last 4 bits are valid. */ + andl $0xf, %ecx +# endif + + testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_4bytes): +# endif + /* Less than 4 bytes to check, try one byte/dword at a time. */ +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + testl %eax, %eax + jne L(cross_page_loop) + subl %ecx, %eax + ret +END (STRCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 16ae72a4c8..df4ba875d9 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S new file mode 100644 index 0000000000..c2c581ecf7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCPY +# define STRCPY __strcpy_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S index 3f2f9e8170..1ce17253ab 100644 --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -37,6 +37,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* zero register */ #define xmmZ xmm0 #define ymmZ ymm0 @@ -46,7 +50,7 @@ # ifndef USE_AS_STRCAT - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCPY) # ifdef USE_AS_STRNCPY mov %RDX_LP, %R8_LP @@ -369,8 +373,8 @@ L(CopyVecSizeExit): lea 1(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(CopyTwoVecSize1): @@ -553,8 +557,7 @@ L(Exit1): lea 2(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit2): @@ -569,8 +572,7 @@ L(Exit2): lea 3(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit3): @@ -584,8 +586,7 @@ L(Exit3): lea 4(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit4_7): @@ -602,8 +603,7 @@ L(Exit4_7): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit8_15): @@ -620,8 +620,7 @@ L(Exit8_15): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit16_31): @@ -638,8 +637,7 @@ L(Exit16_31): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit32_63): @@ -656,8 +654,7 @@ L(Exit32_63): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCPY @@ -671,8 +668,7 @@ L(StrncpyExit1): # ifdef USE_AS_STRCAT movb $0, 1(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit2): @@ -684,8 +680,7 @@ L(StrncpyExit2): # ifdef USE_AS_STRCAT movb $0, 2(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit3_4): @@ -699,8 +694,7 @@ L(StrncpyExit3_4): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit5_8): @@ -714,8 +708,7 @@ L(StrncpyExit5_8): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit9_16): @@ -729,8 +722,7 @@ L(StrncpyExit9_16): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit17_32): @@ -744,8 +736,7 @@ L(StrncpyExit17_32): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit33_64): @@ -760,8 +751,7 @@ L(StrncpyExit33_64): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit65): @@ -778,50 +768,43 @@ L(StrncpyExit65): # ifdef USE_AS_STRCAT movb $0, 65(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifndef USE_AS_STRCAT .p2align 4 L(Fill1): mov %dl, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill2): mov %dx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill3_4): mov %dx, (%rdi) mov %dx, -2(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill5_8): mov %edx, (%rdi) mov %edx, -4(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill9_16): mov %rdx, (%rdi) mov %rdx, -8(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill17_32): vmovdqu %xmmZ, (%rdi) vmovdqu %xmmZ, -16(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(CopyVecSizeUnalignedVec2): @@ -898,8 +881,7 @@ L(Fill): cmp $1, %r8d ja L(Fill2) je L(Fill1) - VZEROUPPER - ret + VZEROUPPER_RETURN /* end of ifndef USE_AS_STRCAT */ # endif @@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): # ifdef USE_AS_STRCAT movb $0, (VEC_SIZE * 4)(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(UnalignedFourVecSizeLeaveCase2): @@ -1001,16 +982,14 @@ L(StrncpyExit): # ifdef USE_AS_STRCAT movb $0, (%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(ExitZero): # ifndef USE_AS_STRCAT mov %rdi, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S new file mode 100644 index 0000000000..a343a1a692 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S @@ -0,0 +1,1003 @@ +/* strcpy with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include + +# ifndef STRCPY +# define STRCPY __strcpy_evex +# endif + +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* Number of bytes in a vector register */ +# ifndef VEC_SIZE +# define VEC_SIZE 32 +# endif + +# define XMM2 xmm18 +# define XMM3 xmm19 + +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 +# define YMM7 ymm23 + +# ifndef USE_AS_STRCAT + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM1 ymm17 + + .section .text.evex,"ax",@progbits +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %RDX_LP, %R8_LP + test %R8_LP, %R8_LP + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO +# endif + + and $((VEC_SIZE * 4) - 1), %ecx + cmp $(VEC_SIZE * 2), %ecx + jbe L(SourceStringAlignmentLessTwoVecSize) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + + vpcmpb $0, (%rsi), %YMMZERO, %k0 + kmovd %k0, %edx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $VEC_SIZE, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $(VEC_SIZE + 1), %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyVecSizeTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail) + + vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 + kmovd %k1, %edx + +# ifdef USE_AS_STRNCPY + add $VEC_SIZE, %r10 + cmp %r10, %r8 + jbe L(CopyTwoVecSizeCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize) + + VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ + VMOVU %YMM2, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(UnalignVecSizeBoth): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $VEC_SIZE, %rcx + VMOVA (%rsi, %rcx), %YMM2 + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 3), %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 + vpcmpb $0, %YMM4, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM4, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + VMOVU %YMM2, (%rdi, %rcx) + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + mov %rsi, %rdx + lea VEC_SIZE(%rsi, %rcx), %rsi + and $-(VEC_SIZE * 4), %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea (VEC_SIZE * 8)(%r8, %rdx), %r8 +# endif +L(UnalignedFourVecSizeLoop): + VMOVA (%rsi), %YMM4 + VMOVA VEC_SIZE(%rsi), %YMM5 + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM5, %YMM4, %YMM2 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. */ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(UnalignedFourVecSizeLeave) + +L(UnalignedFourVecSizeLoop_start): + add $(VEC_SIZE * 4), %rdi + add $(VEC_SIZE * 4), %rsi + VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) + VMOVA (%rsi), %YMM4 + VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) + VMOVA VEC_SIZE(%rsi), %YMM5 + vpminub %YMM5, %YMM4, %YMM2 + VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVU %YMM7, -VEC_SIZE(%rdi) + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. */ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(UnalignedFourVecSizeLoop_start) + +L(UnalignedFourVecSizeLeave): + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_0) + + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %ecx + test %ecx, %ecx + jnz L(CopyVecSizeUnaligned_16) + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_32) + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %ecx + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 3)(%rdi, %rdx), %rax +# endif + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 3), %rsi + add $(VEC_SIZE * 3), %rdi + jmp L(CopyVecSizeExit) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLessTwoVecSize): + VMOVU (%rsi), %YMM3 + VMOVU VEC_SIZE(%rsi), %YMM2 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $VEC_SIZE, %r8 +# else + cmp $(VEC_SIZE + 1), %r8 +# endif + jbe L(CopyVecSizeTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail1) + + VMOVU %YMM3, (%rdi) + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $(VEC_SIZE * 2), %r8 +# else + cmp $((VEC_SIZE * 2) + 1), %r8 +# endif + jbe L(CopyTwoVecSize1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize1) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + jmp L(UnalignVecSizeBoth) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyVecSize): + add %rcx, %rdi +# endif +L(CopyVecSizeTail): + add %rcx, %rsi +L(CopyVecSizeTail1): + bsf %edx, %edx +L(CopyVecSizeExit): + cmp $32, %edx + jae L(Exit32_63) + cmp $16, %edx + jae L(Exit16_31) + cmp $8, %edx + jae L(Exit8_15) + cmp $4, %edx + jae L(Exit4_7) + cmp $3, %edx + je L(Exit3) + cmp $1, %edx + ja L(Exit2) + je L(Exit1) + movb $0, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(CopyTwoVecSize1): + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $VEC_SIZE, %r8 +# endif + jmp L(CopyVecSizeTail1) + + .p2align 4 +L(CopyTwoVecSize): + bsf %edx, %edx + add %rcx, %rsi + add $VEC_SIZE, %edx + sub %ecx, %edx + jmp L(CopyVecSizeExit) + + .p2align 4 +L(CopyVecSizeUnaligned_0): + bsf %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + VMOVU %YMM4, (%rdi) + add $((VEC_SIZE * 4) - 1), %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_16): + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea VEC_SIZE(%rdi, %rdx), %rax +# endif + VMOVU %YMM5, VEC_SIZE(%rdi) + add $((VEC_SIZE * 3) - 1), %r8 + sub %rdx, %r8 + lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_32): + bsf %edx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 2)(%rdi, %rdx), %rax +# endif + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + add $((VEC_SIZE * 2) - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 2), %rsi + add $(VEC_SIZE * 2), %rdi + jmp L(CopyVecSizeExit) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyVecSizeUnalignedVec6): + VMOVU %YMM6, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec5): + VMOVU %YMM5, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec4): + VMOVU %YMM4, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec3): + VMOVU %YMM3, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) +# endif + +/* Case2 */ + + .p2align 4 +L(CopyVecSizeCase2): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2): + add %rcx, %rsi + bsf %edx, %edx + add $VEC_SIZE, %edx + sub %ecx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTailCase2): + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTail1Case2): + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeCase2) +L(CopyVecSizeCase3): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyTwoVecSizeCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyVecSizeTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTailCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSize1Case2OrCase3): + add $VEC_SIZE, %rdi + add $VEC_SIZE, %rsi + sub $VEC_SIZE, %r8 +L(CopyVecSizeTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTail1Case2) + jmp L(StrncpyExit) +# endif + +/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ + + .p2align 4 +L(Exit1): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + movzwl (%rsi), %ecx + mov %cx, (%rdi) + movb $0, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4_7): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov -3(%rsi, %rdx), %ecx + mov %ecx, -3(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8_15): + mov (%rsi), %rcx + mov -7(%rsi, %rdx), %r9 + mov %rcx, (%rdi) + mov %r9, -7(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16_31): + VMOVU (%rsi), %XMM2 + VMOVU -15(%rsi, %rdx), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -15(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32_63): + VMOVU (%rsi), %YMM2 + VMOVU -31(%rsi, %rdx), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -31(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit1): + movzbl (%rsi), %edx + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3_4): + movzwl (%rsi), %ecx + movzwl -2(%rsi, %r8), %edx + mov %cx, (%rdi) + mov %dx, -2(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit5_8): + mov (%rsi), %ecx + mov -4(%rsi, %r8), %edx + mov %ecx, (%rdi) + mov %edx, -4(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit9_16): + mov (%rsi), %rcx + mov -8(%rsi, %r8), %rdx + mov %rcx, (%rdi) + mov %rdx, -8(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit17_32): + VMOVU (%rsi), %XMM2 + VMOVU -16(%rsi, %r8), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -16(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit33_64): + /* 0/32, 31/16 */ + VMOVU (%rsi), %YMM2 + VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit65): + /* 0/32, 32/32, 64/1 */ + VMOVU (%rsi), %YMM2 + VMOVU 32(%rsi), %YMM3 + mov 64(%rsi), %cl + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, 32(%rdi) + mov %cl, 64(%rdi) +# ifdef USE_AS_STPCPY + lea 65(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 65(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) + ret + + .p2align 4 +L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) + ret + + .p2align 4 +L(Fill9_16): + mov %rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) + ret + + .p2align 4 +L(Fill17_32): + VMOVU %XMMZERO, (%rdi) + VMOVU %XMMZERO, -16(%rdi, %r8) + ret + + .p2align 4 +L(CopyVecSizeUnalignedVec2): + VMOVU %YMM2, (%rdi, %rcx) + + .p2align 4 +L(CopyVecSizeVecExit): + bsf %edx, %edx + add $(VEC_SIZE - 1), %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + xor %edx, %edx + sub $VEC_SIZE, %r8 + jbe L(StrncpyFillExit) + + VMOVU %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + + mov %rdi, %rsi + and $(VEC_SIZE - 1), %esi + sub %rsi, %rdi + add %rsi, %r8 + sub $(VEC_SIZE * 4), %r8 + jb L(StrncpyFillLessFourVecSize) + +L(StrncpyFillLoopVmovdqa): + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE * 4), %rdi + sub $(VEC_SIZE * 4), %r8 + jae L(StrncpyFillLoopVmovdqa) + +L(StrncpyFillLessFourVecSize): + add $(VEC_SIZE * 2), %r8 + jl L(StrncpyFillLessTwoVecSize) + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + add $(VEC_SIZE * 2), %rdi + sub $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillLessTwoVecSize): + add $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillExit): + add $VEC_SIZE, %r8 +L(Fill): + cmp $17, %r8d + jae L(Fill17_32) + cmp $9, %r8d + jae L(Fill9_16) + cmp $5, %r8d + jae L(Fill5_8) + cmp $3, %r8d + jae L(Fill3_4) + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) + ret + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(UnalignedFourVecSizeLeaveCase2) +L(UnalignedFourVecSizeLeaveCase3): + lea (VEC_SIZE * 4)(%r8), %rcx + and $-VEC_SIZE, %rcx + add $(VEC_SIZE * 3), %r8 + jl L(CopyVecSizeCase3) + VMOVU %YMM4, (%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM5, VEC_SIZE(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 4)(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) +# endif + ret + + .p2align 4 +L(UnalignedFourVecSizeLeaveCase2): + xor %ecx, %ecx + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + add $(VEC_SIZE * 3), %r8 + jle L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %edx + VMOVU %YMM4, (%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec5) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + VMOVU %YMM5, VEC_SIZE(%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec6) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %edx + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + lea VEC_SIZE(%rdi, %rcx), %rdi + lea VEC_SIZE(%rsi, %rcx), %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) +L(StrncpyExit): + cmp $65, %r8d + je L(StrncpyExit65) + cmp $33, %r8d + jae L(StrncpyExit33_64) + cmp $17, %r8d + jae L(StrncpyExit17_32) + cmp $9, %r8d + jae L(StrncpyExit9_16) + cmp $5, %r8d + jae L(StrncpyExit5_8) + cmp $3, %r8d + jae L(StrncpyExit3_4) + cmp $1, %r8d + ja L(StrncpyExit2) + je L(StrncpyExit1) +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi) +# endif + ret + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S new file mode 100644 index 0000000000..75b4b7612c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRLEN +# define STRLEN __strlen_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 73421ec1b2..45e08e64d6 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -27,370 +27,531 @@ # ifdef USE_AS_WCSLEN # define VPCMPEQ vpcmpeqd # define VPMINU vpminud +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb # define VPMINU vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. */ +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RSI_LP, %RSI_LP +# else test %RSI_LP, %RSI_LP - jz L(zero) -# ifdef USE_AS_WCSLEN - shl $2, %RSI_LP -# elif defined __ILP32__ - /* Clear the upper 32 bits. */ - movl %esi, %esi # endif + jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx + movl %edi, %eax movq %rdi, %rdx vpxor %xmm0, %xmm0, %xmm0 - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 + VPCMPEQ (%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < VEC_SIZE handle special. */ + cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + /* If empty continue to aligned_more. Otherwise return bit + position of first match. */ + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax +# endif + VZEROUPPER_RETURN # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ +# ifdef USE_AS_WCSLEN + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %esi +# endif + btsq %rsi, %rax + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) +L(first_vec_x1): tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax +# else + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax +# endif +# else + subl %edx, %edi + incl %edi + addl %edi, %eax # endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): +L(first_vec_x2): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax +# else + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax +# endif +# else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax +# endif + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax +# else + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax +# endif +# else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax +# endif +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax +# endif + VZEROUPPER_RETURN + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. + */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax +# else + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax +# endif +# else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax +# endif + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 5 +L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of + instructions as using andq with -VEC_SIZE but saves 4 bytes of + code on the x4 check. */ + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +# ifdef USE_AS_STRNLEN + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE + because it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. */ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + /* Align data to VEC_SIZE * 4 - 1. */ # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Before adjusting length check if at last VEC_SIZE * 4. */ + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif + /* Readjust length. */ addq %rcx, %rsi +# else + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif - + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - VPMINU %ymm1, %ymm2, %ymm5 - VPMINU %ymm3, %ymm4, %ymm6 - VPMINU %ymm5, %ymm6, %ymm5 - +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ + subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. + Since the matches in ymm2/ymm4 can only be returned if there + where no matches in ymm1/ymm3 respectively there is no issue + with overlap. */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 + + VPMINU %ymm2, %ymm4, %ymm5 VPCMPEQ %ymm5, %ymm0, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) + vpmovmskb %ymm5, %ecx - addq $(VEC_SIZE * 4), %rdi + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else - subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) - - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm1 vpmovmskb %ymm1, %eax + subq %rdx, %rdi testl %eax, %eax - jnz L(first_vec_x1) + jnz L(last_vec_return_x0) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm2, %ymm0, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax + jnz L(last_vec_return_x1) - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + /* rcx has combined result from all 4 VEC. It will only be used + if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax +# endif + VZEROUPPER_RETURN - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x3_check) - movq %r8, %rax +# ifdef USE_AS_STRNLEN + .p2align 4 +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in ymm1. + */ + subq $-(VEC_SIZE * 4), %rdi +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +L(last_4x_vec_or_less): # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %esi # endif - VZEROUPPER - ret - - .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi - VPCMPEQ (%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off + by VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) + + /* length may have been negative or positive by an offset of + VEC_SIZE * 4 depending on where this was called from. This fixes + that. */ + andl $(VEC_SIZE * 4 - 1), %esi testl %eax, %eax + jnz L(last_vec_x1_check) - jnz L(first_vec_x0_check) subl $VEC_SIZE, %esi - jle L(max) + jb L(max) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - VZEROUPPER - ret - - .p2align 4 -L(first_vec_x0_check): tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x1_check): +L(last_vec_return_x0): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax + subq $(VEC_SIZE * 4 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2_check): +L(last_vec_return_x1): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax + subq $(VEC_SIZE * 3 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN +# ifdef USE_AS_STRNLEN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x1_check): + tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN - .p2align 4 L(max): movq %r8, %rax + VZEROUPPER_RETURN + + .p2align 4 +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + subl $(VEC_SIZE * 3), %esi + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN - .p2align 4 -L(zero): - xorl %eax, %eax - ret -# endif .p2align 4 -L(first_vec_x0): +L(last_vec_x1): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x1): +L(last_vec_x2): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax - addq $VEC_SIZE, %rax + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + subl $(VEC_SIZE * 2), %esi + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi + addl $(VEC_SIZE * 2 + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrq $2, %rax +# endif + VZEROUPPER_RETURN +L(max_end): + movq %r8, %rax + VZEROUPPER_RETURN # endif - VZEROUPPER - ret + /* Cold case for crossing page with first load. */ .p2align 4 -L(4x_vec_end): - VPCMPEQ %ymm1, %ymm0, %ymm1 +L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +# ifdef USE_AS_STRNLEN testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ %ymm2, %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(first_vec_x1) - VPCMPEQ %ymm3, %ymm0, %ymm3 - vpmovmskb %ymm3, %eax + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %ecx +# endif + /* Check length. */ + cmpq %rsi, %rcx + jb L(cross_page_continue) + movq %r8, %rax +# else testl %eax, %eax - jnz L(first_vec_x2) - VPCMPEQ %ymm4, %ymm0, %ymm4 - vpmovmskb %ymm4, %eax -L(first_vec_x3): + jz L(cross_page_continue) tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax +# ifdef USE_AS_WCSLEN + /* NB: Divide length by 4 to get wchar_t count. */ + shrl $2, %eax +# endif +# endif +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +# ifdef USE_AS_STRNLEN + .p2align 4 +L(cross_page_less_vec): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %esi +# endif + cmpq %rax, %rsi + cmovb %esi, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - VZEROUPPER - ret END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S new file mode 100644 index 0000000000..4bf6874b82 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -0,0 +1,489 @@ +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifndef STRLEN +# define STRLEN __strlen_evex +# endif + +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSLEN +# define VPCMP vpcmpd +# define VPMINU vpminud +# define SHIFT_REG ecx +# define CHAR_SIZE 4 +# else +# define VPCMP vpcmpb +# define VPMINU vpminub +# define SHIFT_REG edx +# define CHAR_SIZE 1 +# endif + +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits +ENTRY (STRLEN) +# ifdef USE_AS_STRNLEN + /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +# endif + mov %RSI_LP, %R8_LP +# endif + movl %edi, %eax + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax + /* Check if we may cross page boundary with one vector load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +# ifdef USE_AS_STRNLEN + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + ret +# ifdef USE_AS_STRNLEN +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ + btsq %rsi, %rax + tzcntl %eax, %eax + ret +# endif + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal CHAR_PER_VEC(%rdi, %rax), %eax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax +# endif + ret + + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax +# endif + ret + + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax +# endif + ret + + .p2align 5 +L(aligned_more): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-(VEC_SIZE), %rdi +L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +# ifdef USE_AS_STRNLEN + /* + CHAR_SIZE because it simplies the logic in + last_4x_vec_or_less. */ + leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx + subq %rdx, %rcx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif +# endif + /* Load first VEC regardless. */ + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. */ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + test %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x4) + + addq $VEC_SIZE, %rdi +# ifdef USE_AS_STRNLEN + /* Check if at last VEC_SIZE * 4 length. */ + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif + /* Readjust length. */ + addq %rcx, %rsi +# endif + /* Align data to VEC_SIZE * 4. */ + andq $-(VEC_SIZE * 4), %rdi + + /* Compare 4 * VEC at a time forward. */ + .p2align 4 +L(loop_4x_vec): + /* Load first VEC regardless. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ + subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there where no + matches in ymm1/ymm3 respectively there is no issue with overlap. + */ + VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 + + VPCMP $0, %YMM2, %YMMZERO, %k0 + VPCMP $0, %YMM4, %YMMZERO, %k1 + subq $-(VEC_SIZE * 4), %rdi + kortestd %k0, %k1 + jz L(loop_4x_vec) + + /* Check if end was in first half. */ + kmovd %k0, %eax + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + shrq $2, %rdi +# endif + testl %eax, %eax + jz L(second_vec_return) + + VPCMP $0, %YMM1, %YMMZERO, %k2 + kmovd %k2, %edx + /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ +# ifdef USE_AS_WCSLEN + sall $CHAR_PER_VEC, %eax + orl %edx, %eax + tzcntl %eax, %eax +# else + salq $CHAR_PER_VEC, %rax + orq %rdx, %rax + tzcntq %rax, %rax +# endif + addq %rdi, %rax + ret + + +# ifdef USE_AS_STRNLEN + +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +L(last_4x_vec_or_less_cmpeq): + VPCMP $0, %YMM1, %YMMZERO, %k0 + addq $(VEC_SIZE * 3), %rdi +L(last_4x_vec_or_less): + kmovd %k0, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(CHAR_PER_VEC * 2), %esi + jnz L(last_4x_vec) + + /* length may have been negative or positive by an offset of + CHAR_PER_VEC * 4 depending on where this was called from. This + fixes that. */ + andl $(CHAR_PER_VEC * 4 - 1), %esi + testl %eax, %eax + jnz L(last_vec_x1_check) + + /* Check the end of data. */ + subl $CHAR_PER_VEC, %esi + jb L(max) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret +L(max): + movq %r8, %rax + ret +# endif + + /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) + in the 4x VEC loop can use 2 byte encoding. */ + .p2align 4 +L(second_vec_return): + VPCMP $0, %YMM3, %YMMZERO, %k0 + /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ +# ifdef USE_AS_WCSLEN + kunpckbw %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax +# else + kunpckdq %k0, %k1, %k0 + kmovq %k0, %rax + tzcntq %rax, %rax +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + +# ifdef USE_AS_STRNLEN +L(last_vec_x1_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(CHAR_PER_VEC * 4 - 1), %esi + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + /* Check the end of data. */ + subl $(CHAR_PER_VEC * 3), %esi + jb L(max) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max_end) + + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax + ret + + .p2align 4 +L(last_vec_x1): + tzcntl %eax, %eax + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +L(last_vec_x2): + tzcntl %eax, %eax + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + .p2align 4 +L(last_vec_x3): + tzcntl %eax, %eax + subl $(CHAR_PER_VEC * 2), %esi + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax + ret +L(max_end): + movq %r8, %rax + ret +# endif + + /* Cold case for crossing page with first load. */ + .p2align 4 +L(cross_page_boundary): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + /* Remove the leading bytes. */ +# ifdef USE_AS_WCSLEN + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ + movl %edx, %ecx + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif + /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax +# ifndef USE_AS_STRNLEN + jz L(cross_page_continue) + tzcntl %eax, %eax + ret +# else + jnz L(cross_page_less_vec) +# ifndef USE_AS_WCSLEN + movl %edx, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif + movl $CHAR_PER_VEC, %eax + subl %ecx, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + ja L(cross_page_continue) + movl %esi, %eax + ret +L(cross_page_less_vec): + tzcntl %eax, %eax + /* Select min of length and position of first null. */ + cmpq %rax, %rsi + cmovb %esi, %eax + ret +# endif + +END (STRLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S index 055fbbc690..812af73c13 100644 --- a/sysdeps/x86_64/multiarch/strlen-sse2.S +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S @@ -20,4 +20,4 @@ # define strlen __strlen_sse2 #endif -#include "../strlen.S" +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S new file mode 100644 index 0000000000..439e486a43 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-vec.S @@ -0,0 +1,270 @@ +/* SSE2 version of strlen and SSE4.1 version of wcslen. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + +/* Long lived register in strlen(s), strnlen(s, n) are: + + %xmm3 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + +.text +ENTRY(strlen) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +#define FIND_ZERO \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +#ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret +L(n_nonzero): +# ifdef AS_WCSLEN +/* Check for overflow from maxlen * sizeof(wchar_t). If it would + overflow the only way this program doesn't have undefined behavior + is if there is a null terminator in valid memory so wcslen will + suffice. */ + mov %RSI_LP, %R10_LP + sar $62, %R10_LP + test %R10_LP, %R10_LP + jnz __wcslen_sse4_1 + sal $2, %RSI_LP +# endif + + +/* Initialize long lived registers. */ + + add %RDI_LP, %RSI_LP +# ifdef AS_WCSLEN +/* Check for overflow again from s + maxlen * sizeof(wchar_t). */ + jbe __wcslen_sse4_1 +# endif + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +#ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +#else +# define STRNLEN_PROLOG andq $-64, %rax; +#endif + +/* Ignore bits in mask that come before start of string. */ +#define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + SHIFT_RETURN; \ + ret + +#ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +#else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm4 + PCMPEQ %xmm0, %xmm4 + pmovmskb %xmm4, %edx + test %edx, %edx + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. */ + andq $-16, %rax + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +#endif + + /* When no zero byte is found xmm1-3 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +#ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax + SHIFT_RETURN + ret +#endif + .p2align 4 +L(loop_init): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 +#ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + + .p2align 4 +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. */ + pxor %xmm0, %xmm0 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + + .p2align 4 +L(exit): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + + .p2align 4 +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#endif + +END(strlen) diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S new file mode 100644 index 0000000000..0dcea18dbb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_avx2_rtm +#include "strcat-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S new file mode 100644 index 0000000000..8884f02371 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_evex +#include "strcat-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S new file mode 100644 index 0000000000..37d1224bb9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S new file mode 100644 index 0000000000..a1d53e8c9f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-evex.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_evex +#define USE_AS_STRNCMP 1 +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index 3c94b3ffd9..7accba2b7c 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S new file mode 100644 index 0000000000..79e7083299 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S new file mode 100644 index 0000000000..40e391f0da --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S new file mode 100644 index 0000000000..04f1626a5c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_avx2_rtm +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S new file mode 100644 index 0000000000..722022f303 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_evex +#define USE_AS_STRNLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S new file mode 100644 index 0000000000..5def14ec1c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRRCHR +# define STRRCHR __strrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 146bdd51d0..ad91fab991 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRRCHR) movd %esi, %xmm4 movl %edi, %ecx @@ -166,8 +170,8 @@ L(return_value): # endif bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(match): @@ -198,8 +202,7 @@ L(find_nul): jz L(return_value) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char_and_nul): @@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): jz L(return_null) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_null): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S new file mode 100644 index 0000000000..f920b5a584 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -0,0 +1,265 @@ +/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifndef STRRCHR +# define STRRCHR __strrchr_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSRCHR +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# define SHIFT_REG r8d +# else +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMMMATCH ymm17 +# define YMM1 ymm18 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + VMOVU (%rdi), %YMM1 + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + + addq $VEC_SIZE, %rdi + + testl %eax, %eax + jnz L(first_vec) + + testl %ecx, %ecx + jnz L(return_null) + + andq $-VEC_SIZE, %rdi + xorl %edx, %edx + jmp L(aligned_loop) + + .p2align 4 +L(first_vec): + /* Check if there is a null byte. */ + testl %ecx, %ecx + jnz L(char_and_nul_in_first_vec) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + jmp L(aligned_loop) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_WCSRCHR + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + + VMOVA (%rdi), %YMM1 + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %edx + kmovd %k1, %eax + + shrxl %SHIFT_REG, %edx, %edx + shrxl %SHIFT_REG, %eax, %eax + addq $VEC_SIZE, %rdi + + /* Check if there is a CHAR. */ + testl %eax, %eax + jnz L(found_char) + + testl %edx, %edx + jnz L(return_null) + + jmp L(aligned_loop) + + .p2align 4 +L(found_char): + testl %edx, %edx + jnz L(char_and_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + leaq (%rdi, %rcx), %rsi + + .p2align 4 +L(aligned_loop): + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + add $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jz L(aligned_loop) + + .p2align 4 +L(char_nor_null): + /* Find a CHAR or a null byte in a loop. */ + testl %eax, %eax + jnz L(match) +L(return_value): + testl %edx, %edx + jz L(return_null) + movl %edx, %eax + movq %rsi, %rdi + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(match): + /* Find a CHAR. Check if there is a null byte. */ + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(find_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + jmp L(aligned_loop) + + .p2align 4 +L(find_nul): + /* Mask out any matching bits after the null byte. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* If there is no CHAR here, return the remembered one. */ + jz L(return_value) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(char_and_nul): + /* Find both a CHAR and a null byte. */ + addq %rcx, %rdi + movl %edx, %ecx +L(char_and_nul_in_first_vec): + /* Mask out any matching bits after the null byte. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* Return null pointer if the null byte comes first. */ + jz L(return_null) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(return_null): + xorl %eax, %eax + ret + +END (STRRCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S new file mode 100644 index 0000000000..d49dbbf0b4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_avx2_rtm +#define USE_AS_WCSCHR 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S new file mode 100644 index 0000000000..7cb8f1e41a --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-evex.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_evex +#define USE_AS_WCSCHR 1 +#include "strchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S new file mode 100644 index 0000000000..d6ca2b8064 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_avx2_rtm +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S new file mode 100644 index 0000000000..42e73e51eb --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_evex +#define USE_AS_WCSCMP 1 + +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S new file mode 100644 index 0000000000..35658d7365 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_avx2_rtm +#define USE_AS_WCSLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S new file mode 100644 index 0000000000..bdafa83bd5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-evex.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_evex +#define USE_AS_WCSLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S new file mode 100644 index 0000000000..7e62621afc --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S @@ -0,0 +1,4 @@ +#define AS_WCSLEN +#define strlen __wcslen_sse4_1 + +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c index bb97438c7f..26b5fdffd6 100644 --- a/sysdeps/x86_64/multiarch/wcslen.c +++ b/sysdeps/x86_64/multiarch/wcslen.c @@ -24,7 +24,7 @@ # undef __wcslen # define SYMBOL_NAME wcslen -# include "ifunc-avx2.h" +# include "ifunc-wcslen.h" libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); weak_alias (__wcslen, wcslen); diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S new file mode 100644 index 0000000000..4e88c70cc6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S new file mode 100644 index 0000000000..8a8e310713 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_evex +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S new file mode 100644 index 0000000000..7437ebee2d --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_avx2_rtm +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S new file mode 100644 index 0000000000..24773bb4e2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_evex +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S index a8cab0cb00..5fa51fe07c 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -2,4 +2,4 @@ #define AS_STRNLEN #define strlen __wcsnlen_sse4_1 -#include "../strlen.S" +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c index 52e7e5d4f3..f15c1b328b 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen.c +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -24,27 +24,7 @@ # undef __wcsnlen # define SYMBOL_NAME wcsnlen -# include - -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; - -static inline void * -IFUNC_SELECTOR (void) -{ - const struct cpu_features* cpu_features = __get_cpu_features (); - - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); - - if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) - return OPTIMIZE (sse4_1); - - return OPTIMIZE (sse2); -} +# include "ifunc-wcslen.h" libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); weak_alias (__wcsnlen, wcsnlen); diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S new file mode 100644 index 0000000000..9bf760833f --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_avx2_rtm +#define USE_AS_WCSRCHR 1 +#include "strrchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S new file mode 100644 index 0000000000..c64602f7dc --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_evex +#define USE_AS_WCSRCHR 1 +#include "strrchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S new file mode 100644 index 0000000000..58ed21db01 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_avx2_rtm +#define USE_AS_WMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S new file mode 100644 index 0000000000..06cd0f9f5a --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_evex +#define USE_AS_WMEMCHR 1 + +#include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000000..31104d1215 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe_rtm +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S new file mode 100644 index 0000000000..4726d74aa1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_evex_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-evex-movbe.S" diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 2e226d0d55..8422c15cc8 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,5 +1,5 @@ -/* SSE2 version of strlen/wcslen. - Copyright (C) 2012-2020 Free Software Foundation, Inc. +/* SSE2 version of strlen. + Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,243 +16,6 @@ License along with the GNU C Library; if not, see . */ -#include +#include "multiarch/strlen-vec.S" -#ifdef AS_WCSLEN -# define PMINU pminud -# define PCMPEQ pcmpeqd -# define SHIFT_RETURN shrq $2, %rax -#else -# define PMINU pminub -# define PCMPEQ pcmpeqb -# define SHIFT_RETURN -#endif - -/* Long lived register in strlen(s), strnlen(s, n) are: - - %xmm3 - zero - %rdi - s - %r10 (s+n) & (~(64-1)) - %r11 s+n -*/ - - -.text -ENTRY(strlen) - -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ -#define FIND_ZERO \ - PCMPEQ (%rax), %xmm0; \ - PCMPEQ 16(%rax), %xmm1; \ - PCMPEQ 32(%rax), %xmm2; \ - PCMPEQ 48(%rax), %xmm3; \ - pmovmskb %xmm0, %esi; \ - pmovmskb %xmm1, %edx; \ - pmovmskb %xmm2, %r8d; \ - pmovmskb %xmm3, %ecx; \ - salq $16, %rdx; \ - salq $16, %rcx; \ - orq %rsi, %rdx; \ - orq %r8, %rcx; \ - salq $32, %rcx; \ - orq %rcx, %rdx; - -#ifdef AS_STRNLEN -/* Do not read anything when n==0. */ - test %RSI_LP, %RSI_LP - jne L(n_nonzero) - xor %rax, %rax - ret -L(n_nonzero): -# ifdef AS_WCSLEN - shl $2, %RSI_LP -# endif - -/* Initialize long lived registers. */ - - add %RDI_LP, %RSI_LP - mov %RSI_LP, %R10_LP - and $-64, %R10_LP - mov %RSI_LP, %R11_LP -#endif - - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - movq %rdi, %rax - movq %rdi, %rcx - andq $4095, %rcx -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ - cmpq $4047, %rcx -/* We cannot unify this branching as it would be ~6 cycles slower. */ - ja L(cross_page) - -#ifdef AS_STRNLEN -/* Test if end is among first 64 bytes. */ -# define STRNLEN_PROLOG \ - mov %r11, %rsi; \ - subq %rax, %rsi; \ - andq $-64, %rax; \ - testq $-64, %rsi; \ - je L(strnlen_ret) -#else -# define STRNLEN_PROLOG andq $-64, %rax; -#endif - -/* Ignore bits in mask that come before start of string. */ -#define PROLOG(lab) \ - movq %rdi, %rcx; \ - xorq %rax, %rcx; \ - STRNLEN_PROLOG; \ - sarq %cl, %rdx; \ - test %rdx, %rdx; \ - je L(lab); \ - bsfq %rdx, %rax; \ - SHIFT_RETURN; \ - ret - -#ifdef AS_STRNLEN - andq $-16, %rax - FIND_ZERO -#else - /* Test first 16 bytes unaligned. */ - movdqu (%rax), %xmm4 - PCMPEQ %xmm0, %xmm4 - pmovmskb %xmm4, %edx - test %edx, %edx - je L(next48_bytes) - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ - SHIFT_RETURN - ret - -L(next48_bytes): -/* Same as FIND_ZERO except we do not check first 16 bytes. */ - andq $-16, %rax - PCMPEQ 16(%rax), %xmm1 - PCMPEQ 32(%rax), %xmm2 - PCMPEQ 48(%rax), %xmm3 - pmovmskb %xmm1, %edx - pmovmskb %xmm2, %r8d - pmovmskb %xmm3, %ecx - salq $16, %rdx - salq $16, %rcx - orq %r8, %rcx - salq $32, %rcx - orq %rcx, %rdx -#endif - - /* When no zero byte is found xmm1-3 are zero so we do not have to - zero them. */ - PROLOG(loop) - - .p2align 4 -L(cross_page): - andq $-64, %rax - FIND_ZERO - PROLOG(loop_init) - -#ifdef AS_STRNLEN -/* We must do this check to correctly handle strnlen (s, -1). */ -L(strnlen_ret): - bts %rsi, %rdx - sarq %cl, %rdx - test %rdx, %rdx - je L(loop_init) - bsfq %rdx, %rax - SHIFT_RETURN - ret -#endif - .p2align 4 -L(loop_init): - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 -#ifdef AS_STRNLEN - .p2align 4 -L(loop): - - addq $64, %rax - cmpq %rax, %r10 - je L(exit_end) - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit) - jmp L(loop) - - .p2align 4 -L(exit_end): - cmp %rax, %r11 - je L(first) /* Do not read when end is at page boundary. */ - pxor %xmm0, %xmm0 - FIND_ZERO - -L(first): - bts %r11, %rdx - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - - .p2align 4 -L(exit): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#else - - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ - .p2align 4 -L(loop): - - movdqa 64(%rax), %xmm0 - PMINU 80(%rax), %xmm0 - PMINU 96(%rax), %xmm0 - PMINU 112(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit64) - - subq $-128, %rax - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit0) - jmp L(loop) - - .p2align 4 -L(exit64): - addq $64, %rax -L(exit0): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#endif - -END(strlen) libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index 0b73674f68..c8ad778fee 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -95,6 +95,28 @@ lose: \ #define R14_LP r14 #define R15_LP r15 +/* Zero upper vector registers and return with xtest. NB: Use VZEROALL + to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ +#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + ret; \ +1: \ + vzeroupper; \ + ret + +/* Zero upper vector registers and return. */ +#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + VZEROUPPER; \ + ret +#endif + +#ifndef VZEROUPPER_RETURN +# define VZEROUPPER_RETURN VZEROUPPER; ret +#endif + #else /* __ASSEMBLER__ */ /* Long and pointer size in bytes. */ diff --git a/sysdeps/x86_64/tst-rsi-strlen.c b/sysdeps/x86_64/tst-rsi-strlen.c new file mode 100644 index 0000000000..a80c4f85c2 --- /dev/null +++ b/sysdeps/x86_64/tst-rsi-strlen.c @@ -0,0 +1,81 @@ +/* Test strlen with 0 in the RSI register. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef WIDE +# define TEST_NAME "wcslen" +#else +# define TEST_NAME "strlen" +#endif /* WIDE */ + +#define TEST_MAIN +#include + +#ifdef WIDE +# include +# define STRLEN wcslen +# define CHAR wchar_t +#else +# define STRLEN strlen +# define CHAR char +#endif /* WIDE */ + +IMPL (STRLEN, 1) + +typedef size_t (*proto_t) (const CHAR *); + +typedef struct +{ + void (*fn) (void); +} parameter_t; + +size_t +__attribute__ ((weak, noinline, noclone)) +do_strlen (parameter_t *a, int zero, const CHAR *str) +{ + return CALL (a, str); +} + +static int +test_main (void) +{ + test_init (); + + size_t size = page_size / sizeof (CHAR) - 1; + CHAR *buf = (CHAR *) buf2; + buf[size] = 0; + + parameter_t a; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + a.fn = impl->fn; + /* NB: Pass 0 in RSI. */ + size_t res = do_strlen (&a, 0, buf); + if (res != size) + { + error (0, 0, "Wrong result in function %s: %zu != %zu", + impl->name, res, size); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/tst-rsi-wcslen.c b/sysdeps/x86_64/tst-rsi-wcslen.c new file mode 100644 index 0000000000..f45a7dfb51 --- /dev/null +++ b/sysdeps/x86_64/tst-rsi-wcslen.c @@ -0,0 +1,20 @@ +/* Test wcslen with 0 in the RSI register. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-rsi-strlen.c" diff --git a/sysvipc/test-sysvsem.c b/sysvipc/test-sysvsem.c index 01dbff343a..b7284e0b48 100644 --- a/sysvipc/test-sysvsem.c +++ b/sysvipc/test-sysvsem.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/version.h b/version.h index 83cd196798..e6ca7a8857 100644 --- a/version.h +++ b/version.h @@ -1,4 +1,4 @@ /* This file just defines the current version number of libc. */ -#define RELEASE "release" +#define RELEASE "stable" #define VERSION "2.32" diff -pruN glibc-2.32.orig/sysdeps/unix/sysv/linux/x86_64/64/configure glibc-2.32/sysdeps/unix/sysv/linux/x86_64/64/configure --- glibc-2.32.orig/sysdeps/unix/sysv/linux/x86_64/64/configure 2021-09-18 21:02:32.741186019 +1000 +++ glibc-2.32/sysdeps/unix/sysv/linux/x86_64/64/configure 2021-09-18 21:03:05.314302356 +1000 @@ -4,10 +4,10 @@ test -n "$libc_cv_slibdir" || case "$prefix" in /usr | /usr/) - libc_cv_slibdir='/lib64' - libc_cv_rtlddir='/lib64' + libc_cv_slibdir='/lib' + libc_cv_rtlddir='/lib' if test "$libdir" = '${exec_prefix}/lib'; then - libdir='${exec_prefix}/lib64'; + libdir='${exec_prefix}/lib'; # Locale data can be shared between 32-bit and 64-bit libraries. libc_cv_complocaledir='${exec_prefix}/lib/locale' fi diff -pruN glibc-2.32.orig/sysdeps/unix/sysv/linux/x86_64/ldconfig.h glibc-2.32/sysdeps/unix/sysv/linux/x86_64/ldconfig.h --- glibc-2.32.orig/sysdeps/unix/sysv/linux/x86_64/ldconfig.h 2021-09-18 21:02:32.742186053 +1000 +++ glibc-2.32/sysdeps/unix/sysv/linux/x86_64/ldconfig.h 2021-09-18 21:03:05.314302356 +1000 @@ -18,9 +18,9 @@ #include #define SYSDEP_KNOWN_INTERPRETER_NAMES \ - { "/lib/ld-linux.so.2", FLAG_ELF_LIBC6 }, \ + { "/lib32/ld-linux.so.2", FLAG_ELF_LIBC6 }, \ { "/libx32/ld-linux-x32.so.2", FLAG_ELF_LIBC6 }, \ - { "/lib64/ld-linux-x86-64.so.2", FLAG_ELF_LIBC6 }, + { "/lib/ld-linux-x86-64.so.2", FLAG_ELF_LIBC6 }, #define SYSDEP_KNOWN_LIBRARY_NAMES \ { "libc.so.6", FLAG_ELF_LIBC6 }, \ { "libm.so.6", FLAG_ELF_LIBC6 },